]> git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Include estimated bytes deleted by range tombstones in compensated file size (#10734)
authorChangyu Bi <changyubi@fb.com>
Thu, 29 Dec 2022 21:28:24 +0000 (13:28 -0800)
committerFacebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Thu, 29 Dec 2022 21:28:24 +0000 (13:28 -0800)
Summary:
compensate file sizes in compaction picking so files with range tombstones are preferred, such that they get compacted down earlier as they tend to delete a lot of data. This PR adds a `compensated_range_deletion_size` field in FileMeta that is computed during Flush/Compaction and persisted in MANIFEST. This value is added to `compensated_file_size` which will be used for compaction picking. Currently, for a file in level L, `compensated_range_deletion_size` is set to the estimated bytes deleted by range tombstone of this file in all levels > L. This helps to reduce space amp when data in older levels are covered by range tombstones in level L.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10734

Test Plan:
- Added unit tests.
- benchmark to check if the above definition `compensated_range_deletion_size` is reducing space amp as intended, without affecting write amp too much. The experiment set up favorable for this optimization: large range tombstone issued infrequently. Command used:
```
./db_bench -benchmarks=fillrandom,waitforcompaction,stats,levelstats -use_existing_db=false -avoid_flush_during_recovery=true -write_buffer_size=33554432 -level_compaction_dynamic_level_bytes=true -max_background_jobs=8 -max_bytes_for_level_base=134217728 -target_file_size_base=33554432 -writes_per_range_tombstone=500000 -range_tombstone_width=5000000 -num=50000000 -benchmark_write_rate_limit=8388608 -threads=16 -duration=1800 --max_num_range_tombstones=1000000000
```

In this experiment, each thread wrote 16 range tombstones over the duration of 30 minutes, each range tombstone has width 5M that is the 10% of the key space width. Results shows this PR generates a smaller DB size.

Compaction stats from this PR:
```
Level    Files   Size     Score Read(GB)  Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  L0      2/0   31.54 MB   0.5      0.0     0.0      0.0       8.4      8.4       0.0   1.0      0.0     63.4    135.56            110.94       544    0.249       0      0       0.0       0.0
  L4      3/0   96.55 MB   0.8     18.5     6.7     11.8      18.4      6.6       0.0   2.7     65.3     64.9    290.08            284.03       108    2.686    284M  1957K       0.0       0.0
  L5     15/0   404.41 MB   1.0     19.1     7.7     11.4      18.8      7.4       0.3   2.5     66.6     65.7    292.93            285.34       220    1.332    293M  3808K       0.0       0.0
  L6    143/0    4.12 GB   0.0     45.0     7.5     37.5      41.6      4.1       0.0   5.5     71.2     65.9    647.00            632.66       251    2.578    739M    47M       0.0       0.0
 Sum    163/0    4.64 GB   0.0     82.6    21.9     60.7      87.2     26.5       0.3  10.4     61.9     65.4   1365.58           1312.97      1123    1.216   1318M    52M       0.0       0.0
```

Compaction stats from main:
```
Level    Files   Size     Score Read(GB)  Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  L0      0/0    0.00 KB   0.0      0.0     0.0      0.0       8.4      8.4       0.0   1.0      0.0     60.5    142.12            115.89       569    0.250       0      0       0.0       0.0
  L4      3/0   85.68 MB   1.0     17.7     6.8     10.9      17.6      6.7       0.0   2.6     62.7     62.3    289.05            281.79       112    2.581    272M  2309K       0.0       0.0
  L5     11/0   293.73 MB   1.0     18.8     7.5     11.2      18.5      7.2       0.5   2.5     64.9     63.9    296.07            288.50       220    1.346    288M  4365K       0.0       0.0
  L6    130/0    3.94 GB   0.0     51.5     7.6     43.9      47.9      3.9       0.0   6.3     67.2     62.4    784.95            765.92       258    3.042    848M    51M       0.0       0.0
 Sum    144/0    4.31 GB   0.0     88.0    21.9     66.0      92.3     26.3       0.5  11.0     59.6     62.5   1512.19           1452.09      1159    1.305   1409M    58M       0.0       0.0```

Reviewed By: ajkr

Differential Revision: D39834713

Pulled By: cbi42

fbshipit-source-id: fe9341040b8704a8fbb10cad5cf5c43e962c7e6b

20 files changed:
db/builder.cc
db/builder.h
db/compaction/compaction_job_test.cc
db/compaction/compaction_outputs.cc
db/compaction/compaction_picker_test.cc
db/db_impl/db_impl_compaction_flush.cc
db/db_impl/db_impl_experimental.cc
db/db_impl/db_impl_open.cc
db/db_range_del_test.cc
db/experimental.cc
db/external_sst_file_ingestion_job.cc
db/flush_job.cc
db/import_column_family_job.cc
db/repair.cc
db/version_builder_test.cc
db/version_edit.cc
db/version_edit.h
db/version_edit_test.cc
db/version_set.cc
db/version_set_test.cc

index 9283ffd64d88ab5c2356392890cf9d60763011f8..cb7769e969e755525e3f694fb12ed28a6d3c6e8d 100644 (file)
@@ -71,8 +71,9 @@ Status BuildTable(
     int job_id, const Env::IOPriority io_priority,
     TableProperties* table_properties, Env::WriteLifeTimeHint write_hint,
     const std::string* full_history_ts_low,
-    BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries,
-    uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes) {
+    BlobFileCompletionCallback* blob_callback, Version* version,
+    uint64_t* num_input_entries, uint64_t* memtable_payload_bytes,
+    uint64_t* memtable_garbage_bytes) {
   assert((tboptions.column_family_id ==
           TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
          tboptions.column_family_name.empty());
@@ -246,9 +247,17 @@ Status BuildTable(
         auto tombstone = range_del_it->Tombstone();
         auto kv = tombstone.Serialize();
         builder->Add(kv.first.Encode(), kv.second);
-        meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(),
-                                       tombstone.seq_,
+        InternalKey tombstone_end = tombstone.SerializeEndKey();
+        meta->UpdateBoundariesForRange(kv.first, tombstone_end, tombstone.seq_,
                                        tboptions.internal_comparator);
+        if (version) {
+          SizeApproximationOptions approx_opts;
+          approx_opts.files_size_error_margin = 0.1;
+          meta->compensated_range_deletion_size += versions->ApproximateSize(
+              approx_opts, version, kv.first.Encode(), tombstone_end.Encode(),
+              0 /* start_level */, -1 /* end_level */,
+              TableReaderCaller::kFlush);
+        }
       }
     }
 
index a028fd2ba380f70ca1f729a0bfc97ddf0b721d62..063da5ca9edaffbfbd5cff9f7d0dcffdfb1cd07c 100644 (file)
@@ -13,6 +13,7 @@
 #include "db/range_tombstone_fragmenter.h"
 #include "db/seqno_to_time_mapping.h"
 #include "db/table_properties_collector.h"
+#include "db/version_set.h"
 #include "logging/event_logger.h"
 #include "options/cf_options.h"
 #include "rocksdb/comparator.h"
@@ -70,7 +71,7 @@ extern Status BuildTable(
     Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET,
     const std::string* full_history_ts_low = nullptr,
     BlobFileCompletionCallback* blob_callback = nullptr,
-    uint64_t* num_input_entries = nullptr,
+    Version* version = nullptr, uint64_t* num_input_entries = nullptr,
     uint64_t* memtable_payload_bytes = nullptr,
     uint64_t* memtable_garbage_bytes = nullptr);
 
index 8b312ea78da4541f2e799cef8ed6be20ed56f286..0f0c5daf776cab1a9d8bd643b28d866309681d6d 100644 (file)
@@ -386,7 +386,8 @@ class CompactionJobTestBase : public testing::Test {
         oldest_blob_file_number, kUnknownOldestAncesterTime,
         kUnknownFileCreationTime,
         versions_->GetColumnFamilySet()->GetDefault()->NewEpochNumber(),
-        kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+        kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2,
+        0);
 
     mutex_.Lock();
     EXPECT_OK(
index 072221586504b94a39530c0ce91681ee46cd4cb7..e1fa21b4f9962c24d3373bb2358780b125168be1 100644 (file)
@@ -525,7 +525,8 @@ Status CompactionOutputs::AddRangeDels(
            ucmp->CompareWithoutTimestamp(*lower_bound, kv.second) < 0);
     // Range tombstone is not supported by output validator yet.
     builder_->Add(kv.first.Encode(), kv.second);
-    InternalKey smallest_candidate = std::move(kv.first);
+    InternalKey tombstone_start = std::move(kv.first);
+    InternalKey smallest_candidate{tombstone_start};
     if (lower_bound != nullptr &&
         ucmp->CompareWithoutTimestamp(smallest_candidate.user_key(),
                                       *lower_bound) <= 0) {
@@ -594,7 +595,8 @@ Status CompactionOutputs::AddRangeDels(
         smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
       }
     }
-    InternalKey largest_candidate = tombstone.SerializeEndKey();
+    InternalKey tombstone_end = tombstone.SerializeEndKey();
+    InternalKey largest_candidate{tombstone_end};
     if (upper_bound != nullptr &&
         ucmp->CompareWithoutTimestamp(*upper_bound,
                                       largest_candidate.user_key()) <= 0) {
@@ -636,6 +638,24 @@ Status CompactionOutputs::AddRangeDels(
 #endif
     meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate,
                                   tombstone.seq_, icmp);
+    if (!bottommost_level) {
+      // Range tombstones are truncated at file boundaries
+      if (icmp.Compare(tombstone_start, meta.smallest) < 0) {
+        tombstone_start = meta.smallest;
+      }
+      if (icmp.Compare(tombstone_end, meta.largest) > 0) {
+        tombstone_end = meta.largest;
+      }
+      SizeApproximationOptions approx_opts;
+      approx_opts.files_size_error_margin = 0.1;
+      auto approximate_covered_size =
+          compaction_->input_version()->version_set()->ApproximateSize(
+              approx_opts, compaction_->input_version(),
+              tombstone_start.Encode(), tombstone_end.Encode(),
+              compaction_->output_level() + 1 /* start_level */,
+              -1 /* end_level */, kCompaction);
+      meta.compensated_range_deletion_size += approximate_covered_size;
+    }
     // The smallest key in a file is used for range tombstone truncation, so
     // it cannot have a seqnum of 0 (unless the smallest data key in a file
     // has a seqnum of 0). Otherwise, the truncated tombstone may expose
index dfc508fc5bbdefd45a68e25e7445b2bf3edd3946..865518cb200741332ae3311e002ef12fbdc8da45 100644 (file)
@@ -148,7 +148,7 @@ class CompactionPickerTestBase : public testing::Test {
         smallest_seq, largest_seq, marked_for_compact, temperature,
         kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
         kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum,
-        kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+        kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
     f->compensated_file_size =
         (compensated_file_size != 0) ? compensated_file_size : file_size;
     f->oldest_ancester_time = oldest_ancestor_time;
@@ -2873,7 +2873,6 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) {
   ASSERT_EQ(0, compaction->output_level());
 }
 
-
 #ifndef ROCKSDB_LITE
 TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) {
   const uint64_t kFileSize = 100000;
index 095336f6433a4be846679195a275446660d4d6fd..95ee948eb2f19574d64eaac2a65af82c7ae5fda7 100644 (file)
@@ -1747,7 +1747,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
           f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno,
           f->marked_for_compaction, f->temperature, f->oldest_blob_file_number,
           f->oldest_ancester_time, f->file_creation_time, f->epoch_number,
-          f->file_checksum, f->file_checksum_func_name, f->unique_id);
+          f->file_checksum, f->file_checksum_func_name, f->unique_id,
+          f->compensated_range_deletion_size);
     }
     ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
                     "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
@@ -3388,7 +3389,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
             f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
             f->oldest_blob_file_number, f->oldest_ancester_time,
             f->file_creation_time, f->epoch_number, f->file_checksum,
-            f->file_checksum_func_name, f->unique_id);
+            f->file_checksum_func_name, f->unique_id,
+            f->compensated_range_deletion_size);
 
         ROCKS_LOG_BUFFER(
             log_buffer,
index 035fdbd412344a97b9f91d78c485032c608a03ff..2f732c1e47d008c9d6e544631cba386d0cdccdb5 100644 (file)
@@ -137,7 +137,8 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
                    f->marked_for_compaction, f->temperature,
                    f->oldest_blob_file_number, f->oldest_ancester_time,
                    f->file_creation_time, f->epoch_number, f->file_checksum,
-                   f->file_checksum_func_name, f->unique_id);
+                   f->file_checksum_func_name, f->unique_id,
+                   f->compensated_range_deletion_size);
     }
 
     status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
index f626df9ed311d507e98be432295dbfc3367301d4..3263e94bce56c261091a363b2b5b6df2d89c2886 100644 (file)
@@ -1550,6 +1550,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
           0 /* file_creation_time */, db_id_, db_session_id_,
           0 /* target_file_size */, meta.fd.GetNumber());
       SeqnoToTimeMapping empty_seqno_time_mapping;
+      Version* version = cfd->current();
+      version->Ref();
       s = BuildTable(
           dbname_, versions_.get(), immutable_db_options_, tboptions,
           file_options_for_compaction_, cfd->table_cache(), iter.get(),
@@ -1559,7 +1561,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
           io_tracer_, BlobFileCreationReason::kRecovery,
           empty_seqno_time_mapping, &event_logger_, job_id, Env::IO_HIGH,
           nullptr /* table_properties */, write_hint,
-          nullptr /*full_history_ts_low*/, &blob_callback_);
+          nullptr /*full_history_ts_low*/, &blob_callback_, version);
+      version->Unref();
       LogFlush(immutable_db_options_.info_log);
       ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
                       "[%s] [WriteLevel0TableForRecovery]"
@@ -1583,13 +1586,14 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
   constexpr int level = 0;
 
   if (s.ok() && has_output) {
-    edit->AddFile(
-        level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(),
-        meta.smallest, meta.largest, meta.fd.smallest_seqno,
-        meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature,
-        meta.oldest_blob_file_number, meta.oldest_ancester_time,
-        meta.file_creation_time, meta.epoch_number, meta.file_checksum,
-        meta.file_checksum_func_name, meta.unique_id);
+    edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
+                  meta.fd.GetFileSize(), meta.smallest, meta.largest,
+                  meta.fd.smallest_seqno, meta.fd.largest_seqno,
+                  meta.marked_for_compaction, meta.temperature,
+                  meta.oldest_blob_file_number, meta.oldest_ancester_time,
+                  meta.file_creation_time, meta.epoch_number,
+                  meta.file_checksum, meta.file_checksum_func_name,
+                  meta.unique_id, meta.compensated_range_deletion_size);
 
     for (const auto& blob : blob_file_additions) {
       edit->AddBlobFile(blob);
index 3d468a1c0aba0e0a3d8b0486c8383b4365b80cf5..e6f4e0e4d1f18b7560a8d89ebb8758946c7c8403 100644 (file)
@@ -479,7 +479,10 @@ TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) {
       std::vector<std::string> values;
       // Write 100KB (100 values, each 1K)
       for (int k = 0; k < kNumPerFile; k++) {
-        values.push_back(rnd.RandomString(990));
+        // For the highest level, use smaller value size such that it does not
+        // prematurely cause auto compaction due to range tombstone adding
+        // additional compensated file size
+        values.push_back(rnd.RandomString((i == kNumLevels - 2) ? 600 : 990));
         ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
       }
       // put extra key to trigger flush
@@ -492,7 +495,13 @@ TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) {
     }
     ASSERT_OK(dbfull()->TEST_WaitForCompact());
     ASSERT_EQ(NumTableFilesAtLevel(0), 0);
-    ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1);
+    if (i == kNumLevels - 2) {
+      // For the highest level, value size is smaller (see Put() above),
+      // so output file number is smaller.
+      ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 2);
+    } else {
+      ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1);
+    }
   }
   // Now L1-L3 are full, when we compact L1->L2 we should see (1) subcompactions
   // happen since input level > 0; (2) range deletions are not dropped since
@@ -3004,6 +3013,110 @@ TEST_F(DBRangeDelTest, RangeTombstoneRespectIterateUpperBound) {
   ASSERT_OK(iter->status());
 }
 
+TEST_F(DBRangeDelTest, RangetombesoneCompensateFilesize) {
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  DestroyAndReopen(opts);
+
+  std::vector<std::string> values;
+  Random rnd(301);
+  // file in L2
+  values.push_back(rnd.RandomString(1 << 10));
+  ASSERT_OK(Put("a", values.back()));
+  values.push_back(rnd.RandomString(1 << 10));
+  ASSERT_OK(Put("b", values.back()));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+  uint64_t l2_size = 0;
+  ASSERT_OK(Size("a", "c", 0 /* cf */, &l2_size));
+  ASSERT_GT(l2_size, 0);
+  // file in L1
+  values.push_back(rnd.RandomString(1 << 10));
+  ASSERT_OK(Put("d", values.back()));
+  values.push_back(rnd.RandomString(1 << 10));
+  ASSERT_OK(Put("e", values.back()));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+  uint64_t l1_size = 0;
+  ASSERT_OK(Size("d", "f", 0 /* cf */, &l1_size));
+  ASSERT_GT(l1_size, 0);
+
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f"));
+  ASSERT_OK(Flush());
+  // Range deletion compensated size computed during flush time
+  std::vector<std::vector<FileMetaData>> level_to_files;
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_EQ(level_to_files[0].size(), 1);
+  ASSERT_EQ(level_to_files[0][0].compensated_range_deletion_size,
+            l1_size + l2_size);
+  ASSERT_EQ(level_to_files[1].size(), 1);
+  ASSERT_EQ(level_to_files[1][0].compensated_range_deletion_size, 0);
+  ASSERT_EQ(level_to_files[2].size(), 1);
+  ASSERT_EQ(level_to_files[2][0].compensated_range_deletion_size, 0);
+
+  // Range deletion compensated size computed during compaction time
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                        true /* disallow_trivial_move */));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 1);
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_EQ(level_to_files[1].size(), 1);
+  ASSERT_EQ(level_to_files[1][0].compensated_range_deletion_size, l2_size);
+  ASSERT_EQ(level_to_files[2].size(), 1);
+  ASSERT_EQ(level_to_files[2][0].compensated_range_deletion_size, 0);
+}
+
+TEST_F(DBRangeDelTest, RangetombesoneCompensateFilesizePersistDuringReopen) {
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  DestroyAndReopen(opts);
+
+  std::vector<std::string> values;
+  Random rnd(301);
+  values.push_back(rnd.RandomString(1 << 10));
+  ASSERT_OK(Put("a", values.back()));
+  values.push_back(rnd.RandomString(1 << 10));
+  ASSERT_OK(Put("b", values.back()));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+  ASSERT_OK(Flush());
+
+  std::vector<std::vector<FileMetaData>> level_to_files;
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_EQ(level_to_files[0].size(), 1);
+  ASSERT_EQ(level_to_files[1].size(), 1);
+  ASSERT_EQ(level_to_files[2].size(), 1);
+  uint64_t l2_size = level_to_files[2][0].fd.GetFileSize();
+  uint64_t l1_size = level_to_files[1][0].fd.GetFileSize();
+  ASSERT_GT(l2_size, 0);
+  ASSERT_GT(l1_size, 0);
+  ASSERT_EQ(level_to_files[0][0].compensated_range_deletion_size,
+            l1_size + l2_size);
+  ASSERT_EQ(level_to_files[1][0].compensated_range_deletion_size, l2_size);
+
+  Reopen(opts);
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_EQ(level_to_files[0].size(), 1);
+  ASSERT_EQ(level_to_files[0][0].compensated_range_deletion_size,
+            l1_size + l2_size);
+  ASSERT_EQ(level_to_files[1].size(), 1);
+  ASSERT_EQ(level_to_files[1][0].compensated_range_deletion_size, l2_size);
+}
+
 #endif  // ROCKSDB_LITE
 
 }  // namespace ROCKSDB_NAMESPACE
index cb6286b1f44478c925355f2b161a322ffa872e54..20b5daa970b97d8bee29d69cc47f36256bd4602d 100644 (file)
@@ -113,7 +113,8 @@ Status UpdateManifestForFilesState(
                   lf->fd.largest_seqno, lf->marked_for_compaction, temp,
                   lf->oldest_blob_file_number, lf->oldest_ancester_time,
                   lf->file_creation_time, lf->epoch_number, lf->file_checksum,
-                  lf->file_checksum_func_name, lf->unique_id);
+                  lf->file_checksum_func_name, lf->unique_id,
+                  lf->compensated_range_deletion_size);
             }
           }
         } else {
index dfb9672683aa8869ff72ad870efeef1fc664efab..849f98e874f38b0a5aa038d78e8d5ecc560134af 100644 (file)
@@ -473,7 +473,7 @@ Status ExternalSstFileIngestionJob::Run() {
         ingestion_options_.ingest_behind
             ? kReservedEpochNumberForFileIngestedBehind
             : cfd_->NewEpochNumber(),
-        f.file_checksum, f.file_checksum_func_name, f.unique_id);
+        f.file_checksum, f.file_checksum_func_name, f.unique_id, 0);
     f_metadata.temperature = f.file_temperature;
     edit_.AddFile(f.picked_level, f_metadata);
   }
index c63ccec3e27fb42a4bf5c4f75afd61474c0a993c..ac84da4cae72c11370af0fa57e0c1ddae49e77eb 100644 (file)
@@ -941,7 +941,7 @@ Status FlushJob::WriteLevel0Table() {
           cfd_->internal_stats(), &io_s, io_tracer_,
           BlobFileCreationReason::kFlush, seqno_to_time_mapping_, event_logger_,
           job_context_->job_id, io_priority, &table_properties_, write_hint,
-          full_history_ts_low, blob_callback_, &num_input_entries,
+          full_history_ts_low, blob_callback_, base_, &num_input_entries,
           &memtable_payload_bytes, &memtable_garbage_bytes);
       // TODO: Cleanup io_status in BuildTable and table builders
       assert(!s.ok() || io_s.ok());
@@ -1003,8 +1003,7 @@ Status FlushJob::WriteLevel0Table() {
                    meta_.oldest_blob_file_number, meta_.oldest_ancester_time,
                    meta_.file_creation_time, meta_.epoch_number,
                    meta_.file_checksum, meta_.file_checksum_func_name,
-                   meta_.unique_id);
-
+                   meta_.unique_id, meta_.compensated_range_deletion_size);
     edit_->SetBlobFileAdditions(std::move(blob_file_additions));
   }
 #ifndef ROCKSDB_LITE
index c59ef11ab58e6527f2b1930acaf01cdd910a997c..17ad044a7e7a011ebd952a7eec75b0c23d375abe 100644 (file)
@@ -143,7 +143,7 @@ Status ImportColumnFamilyJob::Run() {
         file_metadata.smallest_seqno, file_metadata.largest_seqno, false,
         file_metadata.temperature, kInvalidBlobFileNumber, oldest_ancester_time,
         current_time, file_metadata.epoch_number, kUnknownFileChecksum,
-        kUnknownFileChecksumFuncName, f.unique_id);
+        kUnknownFileChecksumFuncName, f.unique_id, 0);
     s = dummy_version_builder.Apply(&dummy_version_edit);
   }
   if (s.ok()) {
index ae26f9c6f44605e0cde82026f25c6c70eca26c82..ddec43e9b60d3a3a27c39946510f6cd89a036941 100644 (file)
@@ -665,7 +665,8 @@ class Repairer {
             table->meta.temperature, table->meta.oldest_blob_file_number,
             table->meta.oldest_ancester_time, table->meta.file_creation_time,
             table->meta.epoch_number, table->meta.file_checksum,
-            table->meta.file_checksum_func_name, table->meta.unique_id);
+            table->meta.file_checksum_func_name, table->meta.unique_id,
+            table->meta.compensated_range_deletion_size);
       }
       s = dummy_version_builder.Apply(&dummy_edit);
       if (s.ok()) {
index ed276c65f8c79aceada04e248c77d29ade437426..611dee774b0376d47b2aea0c05cbc11967dee081 100644 (file)
@@ -73,7 +73,7 @@ class VersionBuilderTest : public testing::Test {
         /* marked_for_compact */ false, Temperature::kUnknown,
         oldest_blob_file_number, kUnknownOldestAncesterTime,
         kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum,
-        kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+        kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
     f->compensated_file_size = file_size;
     f->num_entries = num_entries;
     f->num_deletions = num_deletions;
@@ -130,12 +130,13 @@ class VersionBuilderTest : public testing::Test {
     constexpr SequenceNumber largest_seqno = 300;
     constexpr bool marked_for_compaction = false;
 
-    edit->AddFile(
-        level, table_file_number, path_id, file_size, GetInternalKey(smallest),
-        GetInternalKey(largest), smallest_seqno, largest_seqno,
-        marked_for_compaction, Temperature::kUnknown, blob_file_number,
-        kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number,
-        kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    edit->AddFile(level, table_file_number, path_id, file_size,
+                  GetInternalKey(smallest), GetInternalKey(largest),
+                  smallest_seqno, largest_seqno, marked_for_compaction,
+                  Temperature::kUnknown, blob_file_number,
+                  kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+                  epoch_number, kUnknownFileChecksum,
+                  kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   }
 
   void UpdateVersionStorageInfo(VersionStorageInfo* vstorage) {
@@ -186,7 +187,7 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) {
       2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   version_edit.DeleteFile(3, 27U);
 
   EnvOptions env_options;
@@ -233,7 +234,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) {
       3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
+
   version_edit.DeleteFile(0, 1U);
   version_edit.DeleteFile(0, 88U);
 
@@ -283,7 +285,7 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) {
       4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   version_edit.DeleteFile(0, 1U);
   version_edit.DeleteFile(0, 88U);
   version_edit.DeleteFile(4, 6U);
@@ -319,27 +321,27 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) {
       2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   version_edit.AddFile(
       2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   version_edit.AddFile(
       2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   version_edit.AddFile(
       2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   version_edit.AddFile(
       2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
 
   EnvOptions env_options;
   constexpr TableCache* table_cache = nullptr;
@@ -378,27 +380,27 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) {
       2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   version_edit.AddFile(
       2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   version_edit.AddFile(
       2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   version_edit.AddFile(
       2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   version_edit.AddFile(
       2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   ASSERT_OK(version_builder.Apply(&version_edit));
 
   VersionEdit version_edit2;
@@ -406,14 +408,14 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) {
       2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   version_edit2.DeleteFile(2, 616);
   version_edit2.DeleteFile(2, 636);
   version_edit.AddFile(
       2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200,
       false, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
 
   ASSERT_OK(version_builder.Apply(&version_edit2));
   ASSERT_OK(version_builder.SaveTo(&new_vstorage));
@@ -524,7 +526,7 @@ TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) {
       GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno,
       marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
 
   ASSERT_OK(builder.Apply(&addition));
 
@@ -573,7 +575,7 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) {
       GetInternalKey(largest), smallest_seqno, largest_seqno,
       marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
 
   const Status s = builder.Apply(&edit);
   ASSERT_TRUE(s.IsCorruption());
@@ -609,7 +611,7 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) {
       GetInternalKey(largest), smallest_seqno, largest_seqno,
       marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
 
   ASSERT_OK(builder.Apply(&edit));
 
@@ -622,7 +624,7 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) {
       GetInternalKey(largest), smallest_seqno, largest_seqno,
       marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
 
   const Status s = builder.Apply(&other_edit);
   ASSERT_TRUE(s.IsCorruption());
@@ -658,7 +660,7 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) {
       GetInternalKey(largest), smallest_seqno, largest_seqno,
       marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
-      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
 
   ASSERT_OK(builder.Apply(&addition));
 
@@ -1231,7 +1233,7 @@ TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) {
       GetInternalKey(largest), smallest_seqno, largest_seqno,
       marked_for_compaction, Temperature::kUnknown, blob_file_number,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime, 2 /*epoch_number*/,
-      checksum_value, checksum_method, kNullUniqueId64x2);
+      checksum_value, checksum_method, kNullUniqueId64x2, 0);
   edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
                    checksum_method, checksum_value);
 
@@ -1319,7 +1321,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) {
                /* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime,
                kUnknownFileCreationTime, kUnknownEpochNumber,
                kUnknownFileChecksum, kUnknownFileChecksumFuncName,
-               kNullUniqueId64x2);
+               kNullUniqueId64x2, 0);
 
   edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0,
                /* file_size */ 100, /* smallest */ GetInternalKey("801"),
@@ -1329,7 +1331,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) {
                /* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime,
                kUnknownFileCreationTime, kUnknownEpochNumber,
                kUnknownFileChecksum, kUnknownFileChecksumFuncName,
-               kNullUniqueId64x2);
+               kNullUniqueId64x2, 0);
   edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000,
                    /* total_blob_bytes */ 200000,
                    /* checksum_method */ std::string(),
@@ -1550,7 +1552,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) {
       Temperature::kUnknown,
       /* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime,
       kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum,
-      kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
 
   // Add an SST that does not reference any blob files.
   edit.AddFile(
@@ -1560,7 +1562,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) {
       /* largest_seqno */ 2200, /* marked_for_compaction */ false,
       Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
       kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum,
-      kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+      kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
 
   // Delete a file that references a blob file.
   edit.DeleteFile(/* level */ 1, /* file_number */ 6);
@@ -1583,7 +1585,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) {
                /* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime,
                kUnknownFileCreationTime, kUnknownEpochNumber,
                kUnknownFileChecksum, kUnknownFileChecksumFuncName,
-               kNullUniqueId64x2);
+               kNullUniqueId64x2, 0);
 
   // Trivially move a file that does not reference any blob files.
   edit.DeleteFile(/* level */ 1, /* file_number */ 13);
@@ -1595,7 +1597,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) {
                Temperature::kUnknown, kInvalidBlobFileNumber,
                kUnknownOldestAncesterTime, kUnknownFileCreationTime,
                kUnknownEpochNumber, kUnknownFileChecksum,
-               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+               kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
 
   // Add one more SST file that references a blob file, then promptly
   // delete it in a second version edit before the new version gets saved.
@@ -1609,7 +1611,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) {
                /* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime,
                kUnknownFileCreationTime, kUnknownEpochNumber,
                kUnknownFileChecksum, kUnknownFileChecksumFuncName,
-               kNullUniqueId64x2);
+               kNullUniqueId64x2, 0);
 
   VersionEdit edit2;
 
@@ -1710,7 +1712,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) {
       /* oldest_blob_file_number */ kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime,
       1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
-      kNullUniqueId64x2);
+      kNullUniqueId64x2, 0);
   version_edit_1.AddFile(
       /* level */ 0, /* file_number */ 2U, /* path_id */ 0,
       /* file_size */ 100, /* smallest */ GetInternalKey("b", 2),
@@ -1720,7 +1722,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) {
       /* oldest_blob_file_number */ kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime,
       1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
-      kNullUniqueId64x2);
+      kNullUniqueId64x2, 0);
 
   VersionBuilder version_builder_1(EnvOptions(), &ioptions_,
                                    nullptr /* table_cache */, &vstorage_,
@@ -1747,7 +1749,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) {
       /* oldest_blob_file_number */ kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime,
       1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
-      kNullUniqueId64x2);
+      kNullUniqueId64x2, 0);
   version_edit_2.AddFile(
       /* level */ 0, /* file_number */ 2U, /* path_id */ 0,
       /* file_size */ 100, /* smallest */ GetInternalKey("b", 2),
@@ -1757,7 +1759,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) {
       /* oldest_blob_file_number */ kInvalidBlobFileNumber,
       kUnknownOldestAncesterTime, kUnknownFileCreationTime,
       2 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
-      kNullUniqueId64x2);
+      kNullUniqueId64x2, 0);
 
   VersionBuilder version_builder_2(EnvOptions(), &ioptions_,
                                    nullptr /* table_cache */, &vstorage_,
index df522607786d7e53e028797077b617d054ec546c..ecddaa49ccd506d5c62a0c156f994fd2eeb6ae2f 100644 (file)
@@ -231,6 +231,13 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
       std::string unique_id_str = EncodeUniqueIdBytes(&unique_id);
       PutLengthPrefixedSlice(dst, Slice(unique_id_str));
     }
+    if (f.compensated_range_deletion_size) {
+      PutVarint32(dst, kCompensatedRangeDeletionSize);
+      std::string compensated_range_deletion_size;
+      PutVarint64(&compensated_range_deletion_size,
+                  f.compensated_range_deletion_size);
+      PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size));
+    }
 
     TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
                              dst);
@@ -404,6 +411,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
             return "invalid unique id";
           }
           break;
+        case kCompensatedRangeDeletionSize:
+          if (!GetVarint64(&field, &f.compensated_range_deletion_size)) {
+            return "Invalid compensated range deletion size";
+          }
+          break;
         default:
           if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
             // Should not proceed if cannot understand it
index cfc5f14e572f24ee3f2e456630dadbc6684542d4..8be5f4f52014933f88b2e5b81eb11a4c09c44d2a 100644 (file)
@@ -89,6 +89,7 @@ enum NewFileCustomTag : uint32_t {
   kMaxTimestamp = 11,
   kUniqueId = 12,
   kEpochNumber = 13,
+  kCompensatedRangeDeletionSize = 14,
 
   // If this bit for the custom tag is set, opening DB should fail if
   // we don't know this field.
@@ -182,15 +183,22 @@ struct FileMetaData {
   // Stats for compensating deletion entries during compaction
 
   // File size compensated by deletion entry.
-  // This is updated in Version::UpdateAccumulatedStats() first time when the
-  // file is created or loaded.  After it is updated (!= 0), it is immutable.
+  // This is used to compute a file's compaction priority, and is updated in
+  // Version::ComputeCompensatedSizes() first time when the file is created or
+  // loaded.  After it is updated (!= 0), it is immutable.
   uint64_t compensated_file_size = 0;
   // These values can mutate, but they can only be read or written from
   // single-threaded LogAndApply thread
   uint64_t num_entries = 0;     // the number of entries.
-  uint64_t num_deletions = 0;   // the number of deletion entries.
+  // The number of deletion entries, including range deletions.
+  uint64_t num_deletions = 0;
   uint64_t raw_key_size = 0;    // total uncompressed key size.
   uint64_t raw_value_size = 0;  // total uncompressed value size.
+  uint64_t num_range_deletions = 0;
+  // This is computed during Flush/Compaction, and is added to
+  // `compensated_file_size`. Currently, this estimates the size of keys in the
+  // next level covered by range tombstones in this file.
+  uint64_t compensated_range_deletion_size = 0;
 
   int refs = 0;  // Reference count
 
@@ -240,10 +248,12 @@ struct FileMetaData {
                uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
                uint64_t _epoch_number, const std::string& _file_checksum,
                const std::string& _file_checksum_func_name,
-               UniqueId64x2 _unique_id)
+               UniqueId64x2 _unique_id,
+               const uint64_t _compensated_range_deletion_size)
       : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
         smallest(smallest_key),
         largest(largest_key),
+        compensated_range_deletion_size(_compensated_range_deletion_size),
         marked_for_compaction(marked_for_compact),
         temperature(_temperature),
         oldest_blob_file_number(oldest_blob_file),
@@ -434,7 +444,8 @@ class VersionEdit {
                uint64_t oldest_ancester_time, uint64_t file_creation_time,
                uint64_t epoch_number, const std::string& file_checksum,
                const std::string& file_checksum_func_name,
-               const UniqueId64x2& unique_id) {
+               const UniqueId64x2& unique_id,
+               const uint64_t compensated_range_deletion_size) {
     assert(smallest_seqno <= largest_seqno);
     new_files_.emplace_back(
         level,
@@ -442,7 +453,8 @@ class VersionEdit {
                      smallest_seqno, largest_seqno, marked_for_compaction,
                      temperature, oldest_blob_file_number, oldest_ancester_time,
                      file_creation_time, epoch_number, file_checksum,
-                     file_checksum_func_name, unique_id));
+                     file_checksum_func_name, unique_id,
+                     compensated_range_deletion_size));
     if (!HasLastSequence() || largest_seqno > GetLastSequence()) {
       SetLastSequence(largest_seqno);
     }
index 7571291e22c734a5784a9ca33c6a7b902183f7ef..1fa6c00549737bb9a1c6cf44df95c0e1b10d9eb5 100644 (file)
@@ -45,7 +45,7 @@ TEST_F(VersionEditTest, EncodeDecode) {
                  kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown,
                  kInvalidBlobFileNumber, 888, 678,
                  kBig + 300 + i /* epoch_number */, "234", "crc32c",
-                 kNullUniqueId64x2);
+                 kNullUniqueId64x2, 0);
     edit.DeleteFile(4, kBig + 700 + i);
   }
 
@@ -65,24 +65,24 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) {
                kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
                kUnknownOldestAncesterTime, kUnknownFileCreationTime,
                300 /* epoch_number */, kUnknownFileChecksum,
-               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+               kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
                InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
                kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber,
                kUnknownOldestAncesterTime, kUnknownFileCreationTime,
                301 /* epoch_number */, kUnknownFileChecksum,
-               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+               kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue),
                InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502,
                kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber,
                666, 888, 302 /* epoch_number */, kUnknownFileChecksum,
-               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+               kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex),
                InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503,
                kBig + 603, true, Temperature::kUnknown, 1001,
                kUnknownOldestAncesterTime, kUnknownFileCreationTime,
                303 /* epoch_number */, kUnknownFileChecksum,
-               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+               kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
 
   edit.DeleteFile(4, 700);
 
@@ -123,12 +123,12 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) {
                kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
                kUnknownOldestAncesterTime, kUnknownFileCreationTime,
                300 /* epoch_number */, kUnknownFileChecksum,
-               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+               kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
                InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
                kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber,
                686, 868, 301 /* epoch_number */, "234", "crc32c",
-               kNullUniqueId64x2);
+               kNullUniqueId64x2, 0);
   edit.DeleteFile(4, 700);
 
   edit.SetComparatorName("foo");
@@ -177,7 +177,7 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) {
                kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
                kUnknownOldestAncesterTime, kUnknownFileCreationTime,
                300 /* epoch_number */, kUnknownFileChecksum,
-               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+               kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
 
   edit.SetComparatorName("foo");
   edit.SetLogNumber(kBig + 100);
@@ -208,7 +208,7 @@ TEST_F(VersionEditTest, EncodeEmptyFile) {
                Temperature::kUnknown, kInvalidBlobFileNumber,
                kUnknownOldestAncesterTime, kUnknownFileCreationTime,
                1 /*epoch_number*/, kUnknownFileChecksum,
-               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+               kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
   std::string buffer;
   ASSERT_TRUE(!edit.EncodeTo(&buffer));
 }
index 8a8fa1d75bdcdc0e673fbb54247c912086181d2a..be1db7ba3ef5dc47a796e243fb0353aa1db95b16 100644 (file)
@@ -2960,7 +2960,7 @@ bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
   file_meta->num_deletions = tp->num_deletions;
   file_meta->raw_value_size = tp->raw_value_size;
   file_meta->raw_key_size = tp->raw_key_size;
-
+  file_meta->num_range_deletions = tp->num_range_deletions;
   return true;
 }
 
@@ -3062,11 +3062,15 @@ void VersionStorageInfo::ComputeCompensatedSizes() {
         // size of deletion entries in a stable workload, the deletion
         // compensation logic might introduce unwanted effet which changes the
         // shape of LSM tree.
-        if (file_meta->num_deletions * 2 >= file_meta->num_entries) {
+        if ((file_meta->num_deletions - file_meta->num_range_deletions) * 2 >=
+            file_meta->num_entries) {
           file_meta->compensated_file_size +=
-              (file_meta->num_deletions * 2 - file_meta->num_entries) *
+              ((file_meta->num_deletions - file_meta->num_range_deletions) * 2 -
+               file_meta->num_entries) *
               average_value_size * kDeletionWeightOnCompaction;
         }
+        file_meta->compensated_file_size +=
+            file_meta->compensated_range_deletion_size;
       }
     }
   }
@@ -6215,7 +6219,8 @@ Status VersionSet::WriteCurrentStateToManifest(
                        f->marked_for_compaction, f->temperature,
                        f->oldest_blob_file_number, f->oldest_ancester_time,
                        f->file_creation_time, f->epoch_number, f->file_checksum,
-                       f->file_checksum_func_name, f->unique_id);
+                       f->file_checksum_func_name, f->unique_id,
+                       f->compensated_range_deletion_size);
         }
       }
 
@@ -6293,8 +6298,9 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
   const int num_non_empty_levels = vstorage->num_non_empty_levels();
   end_level = (end_level == -1) ? num_non_empty_levels
                                 : std::min(end_level, num_non_empty_levels);
-
-  assert(start_level <= end_level);
+  if (end_level <= start_level) {
+    return 0;
+  }
 
   // Outline of the optimization that uses options.files_size_error_margin.
   // When approximating the files total size that is used to store a keys range,
index c179f7a6a65e8b548fe5513176a1073e40b870da..9234a4d880ce4db4ac3cc36d4bd063f0cda74105 100644 (file)
@@ -51,7 +51,7 @@ class GenerateLevelFilesBriefTest : public testing::Test {
         largest_seq, /* marked_for_compact */ false, Temperature::kUnknown,
         kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
         kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum,
-        kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+        kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
     files_.push_back(f);
   }
 
@@ -143,16 +143,19 @@ class VersionStorageInfoTestBase : public testing::Test {
 
   void Add(int level, uint32_t file_number, const char* smallest,
            const char* largest, uint64_t file_size = 0,
-           uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+           uint64_t oldest_blob_file_number = kInvalidBlobFileNumber,
+           uint64_t compensated_range_deletion_size = 0) {
     constexpr SequenceNumber dummy_seq = 0;
 
     Add(level, file_number, GetInternalKey(smallest, dummy_seq),
-        GetInternalKey(largest, dummy_seq), file_size, oldest_blob_file_number);
+        GetInternalKey(largest, dummy_seq), file_size, oldest_blob_file_number,
+        compensated_range_deletion_size);
   }
 
   void Add(int level, uint32_t file_number, const InternalKey& smallest,
            const InternalKey& largest, uint64_t file_size = 0,
-           uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+           uint64_t oldest_blob_file_number = kInvalidBlobFileNumber,
+           uint64_t compensated_range_deletion_size = 0) {
     assert(level < vstorage_.num_levels());
     FileMetaData* f = new FileMetaData(
         file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0,
@@ -160,8 +163,7 @@ class VersionStorageInfoTestBase : public testing::Test {
         Temperature::kUnknown, oldest_blob_file_number,
         kUnknownOldestAncesterTime, kUnknownFileCreationTime,
         kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
-        kNullUniqueId64x2);
-    f->compensated_file_size = file_size;
+        kNullUniqueId64x2, compensated_range_deletion_size);
     vstorage_.AddFile(level, f);
   }
 
@@ -2136,6 +2138,17 @@ TEST_F(VersionSetTest, AtomicGroupWithWalEdits) {
   }
 }
 
+TEST_F(VersionStorageInfoTest, AddRangeDeletionCompensatedFileSize) {
+  // Tests that compensated range deletion size is added to compensated file
+  // size.
+  Add(4, 100U, "1", "2", 100U, kInvalidBlobFileNumber, 1000U);
+
+  UpdateVersionStorageInfo();
+
+  auto meta = vstorage_.GetFileMetaDataByNumber(100U);
+  ASSERT_EQ(meta->compensated_file_size, 100U + 1000U);
+}
+
 class VersionSetWithTimestampTest : public VersionSetTest {
  public:
   static const std::string kNewCfName;
@@ -3242,7 +3255,8 @@ class VersionSetTestMissingFiles : public VersionSetTestBase,
       file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey,
                                ikey, 0, 0, false, Temperature::kUnknown, 0, 0,
                                0, info.epoch_number, kUnknownFileChecksum,
-                               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+                               kUnknownFileChecksumFuncName, kNullUniqueId64x2,
+                               0);
     }
   }
 
@@ -3299,7 +3313,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) {
         file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
         largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
         file_num /* epoch_number */, kUnknownFileChecksum,
-        kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+        kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
     added_files.emplace_back(0, meta);
   }
   WriteFileAdditionAndDeletionToManifest(
@@ -3360,7 +3374,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) {
         file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
         largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
         file_num /* epoch_number */, kUnknownFileChecksum,
-        kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+        kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
     added_files.emplace_back(0, meta);
   }
   WriteFileAdditionAndDeletionToManifest(