]> git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Add trivial move support in CompactFiles API (#14112)
author    Xingbo Wang <xbw@fb.com>
          Mon, 10 Nov 2025 23:20:50 +0000 (15:20 -0800)
committer Jay Huh <jewoongh@meta.com>
          Mon, 10 Nov 2025 23:38:32 +0000 (15:38 -0800)
Summary:
Support trivial move in CompactFiles API, which is not supported previously.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14112

Test Plan: Unit test

Reviewed By: cbi42

Differential Revision: D86546150

Pulled By: xingbowang

fbshipit-source-id: 08a3ae9a055f3d3d41711403b1695f44977e6ea8

db/compact_files_test.cc
db/db_impl/db_impl.h
db/db_impl/db_impl_compaction_flush.cc
include/rocksdb/options.h
unreleased_history/new_features/Trivial_move_support_in_CompactFiles_API.md [new file with mode: 0644]

index 83bec82b94c737b0e0ab2683d366ff7cc11958e7..b1331d1ccff98c716015ac5675b7649f6ff745b9 100644 (file)
@@ -534,6 +534,232 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) {
   delete db;
 }
 
+// Helper to generate keys with a two-digit zero-padded index so they sort
+// lexicographically, e.g. MakeKey("a", 5) -> "a05" (assumes 0 <= index <= 99)
+static std::string MakeKey(const std::string& prefix, int index) {
+  return prefix + (index < 10 ? "0" : "") + std::to_string(index);
+}
+
+TEST_F(CompactFilesTest, TrivialMoveNonOverlappingFiles) {
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.level_compaction_dynamic_level_bytes = false;
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  ASSERT_NE(db, nullptr);
+
+  // Create 3 non-overlapping files in L0 (one file per Flush below)
+  // File 1: keys [a00-a99]
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("a", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // File 2: keys [b00-b99]
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("b", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // File 3: keys [c00-c99]
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("c", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Verify all three files landed in L0 and L1 is empty
+  ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[0].files.size(), 3);
+  ASSERT_EQ(meta.levels[1].files.size(), 0);
+
+  // Collect full paths of the L0 files for CompactFiles
+  std::vector<std::string> l0_files;
+  for (const auto& file : meta.levels[0].files) {
+    l0_files.push_back(file.db_path + "/" + file.name);
+  }
+
+  CompactionOptions compact_option;
+  compact_option.allow_trivial_move = true;
+  // Compact all L0 files to L1 (non-overlapping in L1)
+  ASSERT_OK(db->CompactFiles(compact_option, l0_files, 1));
+
+  // Verify files are now in L1
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[0].files.size(), 0);
+  ASSERT_EQ(meta.levels[1].files.size(), 3);
+
+  // Get the first file from L1 (should be the one with keys a00-a99)
+  std::string l1_file_to_move;
+  std::vector<std::string> l1_files_to_move_later;
+  uint64_t l1_file_number = 0;
+  for (const auto& file : meta.levels[1].files) {
+    if (file.smallestkey[0] == 'a') {
+      l1_file_to_move = file.db_path + "/" + file.name;
+      l1_file_number = file.file_number;
+    } else {
+      l1_files_to_move_later.push_back(file.db_path + "/" + file.name);
+    }
+  }
+  ASSERT_FALSE(l1_file_to_move.empty());
+
+  // Set up sync point to verify trivial move path is taken
+  bool trivial_move_executed = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::CompactFilesImpl:TrivialMove",
+      [&](void* /*arg*/) { trivial_move_executed = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Move the file from L1 to L6 - this should be a trivial move
+  // because the file doesn't overlap with anything in L6
+  std::vector<std::string> files_to_move = {l1_file_to_move};
+  ASSERT_OK(db->CompactFiles(compact_option, files_to_move, 6));
+
+  // Verify trivial move was executed
+  ASSERT_TRUE(trivial_move_executed);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Verify the file is now in L6
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[1].files.size(), 2);  // Two files remain in L1
+  ASSERT_EQ(meta.levels[6].files.size(), 1);  // One file in L6
+
+  // Verify it's the correct file in L6
+  bool found_file_in_l6 = false;
+  for (const auto& file : meta.levels[6].files) {
+    if (file.file_number == l1_file_number) {
+      found_file_in_l6 = true;
+      // A trivial move keeps the same file number and key range
+      ASSERT_EQ(file.smallestkey[0], 'a');
+      ASSERT_EQ(file.largestkey[0], 'a');
+      break;
+    }
+  }
+  ASSERT_TRUE(found_file_in_l6);
+
+  // Move the other 2 files from L1 to L6, with allow_trivial_move set to false.
+  // This will trigger a normal compaction, so the 2 files will be compacted
+  // into a single file in L6.
+  ASSERT_OK(db->CompactFiles(CompactionOptions(), l1_files_to_move_later, 6));
+
+  // Verify files in L6
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[1].files.size(), 0);  // Zero files remain in L1
+  ASSERT_EQ(meta.levels[6].files.size(), 2);  // Two files in L6
+
+  // Verify data integrity - the moved 'a' keys should still be readable
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("a", i);
+    std::string value;
+    ASSERT_OK(db->Get(ReadOptions(), key, &value));
+    ASSERT_EQ(value, "value_" + key);
+  }
+
+  delete db;
+}
+
+TEST_F(CompactFilesTest, TrivialMoveBlockedByOverlap) {
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.num_levels = 7;
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  ASSERT_NE(db, nullptr);
+
+  // Write keys [m00-m99] (wide range); flushed to one L0 file, moved to L6
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("m", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Get L0 file
+  ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  std::vector<std::string> l0_files;
+  for (const auto& file : meta.levels[0].files) {
+    l0_files.push_back(file.db_path + "/" + file.name);
+  }
+
+  CompactionOptions compact_option;
+  compact_option.allow_trivial_move = true;
+
+  // Move to L6
+  ASSERT_OK(db->CompactFiles(compact_option, l0_files, 6));
+
+  // Now write overlapping keys [m50-m60]; flushed to L0, then moved to L1
+  for (int i = 50; i <= 60; i++) {
+    std::string key = "m" + std::to_string(i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "updated_value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Get the L0 file
+  db->GetColumnFamilyMetaData(&meta);
+  std::vector<std::string> l0_files_2;
+  for (const auto& file : meta.levels[0].files) {
+    l0_files_2.push_back(file.db_path + "/" + file.name);
+  }
+
+  // Move to L1
+  ASSERT_OK(db->CompactFiles(compact_option, l0_files_2, 1));
+
+  // Get the L1 file
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[1].files.size(), 1);
+  std::string l1_file =
+      meta.levels[1].files[0].db_path + "/" + meta.levels[1].files[0].name;
+
+  // Set up sync point to verify full compaction path is taken
+  bool trivial_move_executed = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::CompactFilesImpl:TrivialMove",
+      [&](void* /*arg*/) { trivial_move_executed = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Try to move from L1 to L6 - this should NOT be a trivial move
+  // because the file overlaps with the existing file in L6
+  ASSERT_OK(db->CompactFiles(compact_option, {l1_file}, 6));
+
+  // Verify trivial move was NOT executed (full compaction happened)
+  ASSERT_FALSE(trivial_move_executed);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Verify the result - should have merged data in L6
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[1].files.size(), 0);  // L1 should be empty
+  // L6 should have the merged file (may be 1 file if merged, or 2 if not)
+  ASSERT_GE(meta.levels[6].files.size(), 1);
+
+  // Verify the overlapping keys read back the newer (updated) values
+  for (int i = 50; i <= 60; i++) {
+    std::string key = "m" + std::to_string(i);
+    std::string value;
+    ASSERT_OK(db->Get(ReadOptions(), key, &value));
+    ASSERT_EQ(value, "updated_value_" + key);
+  }
+
+  delete db;
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
index da1879688e56d99e653628c2e6ac3ca1226e136c..c3d045725242f6989a27759ebd8d1365922643ba 100644 (file)
@@ -2396,6 +2396,14 @@ class DBImpl : public DB {
                           JobContext* job_context, LogBuffer* log_buffer,
                           CompactionJobInfo* compaction_job_info);
 
+  // Helper function to perform trivial move by updating manifest metadata
+  // without rewriting data files. This is called when IsTrivialMove() is true.
+  // REQUIRES: mutex held
+  // Returns: Status of the trivial move operation
+  Status PerformTrivialMove(Compaction& c, LogBuffer* log_buffer,
+                            bool& compaction_released, size_t& moved_files,
+                            size_t& moved_bytes);
+
   // REQUIRES: mutex unlocked
   void TrackOrUntrackFiles(const std::vector<std::string>& existing_data_files,
                            bool track);
index 5e883874715e675109d1f616d3e0a5efef2c6756..9f4d08e938ee18371654992006e8acfa964b032e 100644 (file)
@@ -1424,6 +1424,56 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
   return s;
 }
 
+Status DBImpl::PerformTrivialMove(Compaction& c, LogBuffer* log_buffer,
+                                  bool& compaction_released,
+                                  size_t& moved_files, size_t& moved_bytes) {
+  mutex_.AssertHeld();
+
+  // NOTE(review): this logs only num_input_files(0) though the loop below
+  // walks every input level - confirm whether multi-level counts are wanted
+  ROCKS_LOG_BUFFER(log_buffer, "[%s] Moving %d files to level-%d\n",
+                   c.column_family_data()->GetName().c_str(),
+                   static_cast<int>(c.num_input_files(0)), c.output_level());
+
+  // Re-link each input file to the output level via VersionEdit (no data I/O)
+  for (unsigned int l = 0; l < c.num_input_levels(); l++) {
+    if (c.level(l) == c.output_level()) {
+      continue;
+    }
+    for (size_t i = 0; i < c.num_input_files(l); i++) {
+      FileMetaData* f = c.input(l, i);
+      c.edit()->DeleteFile(c.level(l), f->fd.GetNumber());
+      c.edit()->AddFile(c.output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
+                        f->fd.GetFileSize(), f->smallest, f->largest,
+                        f->fd.smallest_seqno, f->fd.largest_seqno,
+                        f->marked_for_compaction, f->temperature,
+                        f->oldest_blob_file_number, f->oldest_ancester_time,
+                        f->file_creation_time, f->epoch_number,
+                        f->file_checksum, f->file_checksum_func_name,
+                        f->unique_id, f->compensated_range_deletion_size,
+                        f->tail_size, f->user_defined_timestamps_persisted);
+      moved_bytes += static_cast<size_t>(c.input(l, i)->fd.GetFileSize());
+      ROCKS_LOG_BUFFER(
+          log_buffer, "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
+          c.column_family_data()->GetName().c_str(), f->fd.GetNumber(),
+          c.output_level(), f->fd.GetFileSize());
+    }
+    moved_files += c.num_input_files(l);
+  }
+
+  // Persist the edit to the MANIFEST and install the resulting version;
+  // the callback releases the compaction's input files exactly once
+  const ReadOptions read_options(Env::IOActivity::kCompaction);
+  const WriteOptions write_options(Env::IOActivity::kCompaction);
+  Status status = versions_->LogAndApply(
+      c.column_family_data(), read_options, write_options, c.edit(), &mutex_,
+      directories_.GetDbDir(), /*new_descriptor_log=*/false,
+      /*column_family_options=*/nullptr,
+      [&c, &compaction_released](const Status& s) {
+        c.ReleaseCompactionFiles(s);
+        compaction_released = true;
+      });
+
+  return status;
+}
+
 Status DBImpl::CompactFilesImpl(
     const CompactionOptions& compact_options, ColumnFamilyData* cfd,
     Version* version, const std::vector<std::string>& input_file_names,
@@ -1511,6 +1561,63 @@ Status DBImpl::CompactFilesImpl(
   // deletion compaction currently not allowed in CompactFiles.
   assert(!c->deletion_compaction());
 
+  // Check if this can be a trivial move (metadata-only update)
+  // Similar to the logic in DBImpl::BackgroundCompaction
+  // Note: We disable trivial move when compaction_service is present because
+  // the service expects all compactions to go through CompactionJob for
+  // tracking
+  bool is_trivial_move = compact_options.allow_trivial_move &&
+                         c->IsTrivialMove() &&
+                         immutable_db_options().compaction_service == nullptr;
+
+  if (is_trivial_move) {
+    // Perform trivial move: just update manifest without rewriting data
+    TEST_SYNC_POINT("DBImpl::CompactFilesImpl:TrivialMove");
+
+    bool compaction_released = false;
+    size_t moved_files = 0;
+    size_t moved_bytes = 0;
+    Status status = PerformTrivialMove(
+        *c.get(), log_buffer, compaction_released, moved_files, moved_bytes);
+
+    if (status.ok()) {
+      InstallSuperVersionAndScheduleWork(
+          c->column_family_data(), job_context->superversion_contexts.data());
+
+      // Populate output file names for trivial move
+      if (output_file_names != nullptr) {
+        for (const auto& newf : c->edit()->GetNewFiles()) {
+          output_file_names->push_back(TableFileName(
+              c->immutable_options().cf_paths, newf.second.fd.GetNumber(),
+              newf.second.fd.GetPathId()));
+        }
+      }
+
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "[%s] Trivial move succeeded for %zu files, %zu bytes total\n",
+          c->column_family_data()->GetName().c_str(), moved_files, moved_bytes);
+    } else {
+      if (!compaction_released) {
+        c->ReleaseCompactionFiles(status);
+      }
+      ROCKS_LOG_BUFFER(log_buffer, "[%s] Trivial move failed: %s\n",
+                       c->column_family_data()->GetName().c_str(),
+                       status.ToString().c_str());
+      error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+    }
+
+    c.reset();
+    bg_compaction_scheduled_--;
+    if (bg_compaction_scheduled_ == 0) {
+      bg_cv_.SignalAll();
+    }
+    MaybeScheduleFlushOrCompaction();
+
+    return status;
+  }
+
+  // Not a trivial move, proceed with full compaction
   InitSnapshotContext(job_context);
 
   std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
@@ -4074,35 +4181,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
                             compaction_job_stats, job_context->job_id);
 
-    // Move files to next level
-    int32_t moved_files = 0;
-    int64_t moved_bytes = 0;
-    for (unsigned int l = 0; l < c->num_input_levels(); l++) {
-      if (c->level(l) == c->output_level()) {
-        continue;
-      }
-      for (size_t i = 0; i < c->num_input_files(l); i++) {
-        FileMetaData* f = c->input(l, i);
-        c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
-        c->edit()->AddFile(
-            c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
-            f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno,
-            f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
-            f->oldest_blob_file_number, f->oldest_ancester_time,
-            f->file_creation_time, f->epoch_number, f->file_checksum,
-            f->file_checksum_func_name, f->unique_id,
-            f->compensated_range_deletion_size, f->tail_size,
-            f->user_defined_timestamps_persisted);
-
-        ROCKS_LOG_BUFFER(
-            log_buffer,
-            "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
-            c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
-            c->output_level(), f->fd.GetFileSize());
-        ++moved_files;
-        moved_bytes += f->fd.GetFileSize();
-      }
-    }
     if (c->compaction_reason() == CompactionReason::kLevelMaxLevelSize &&
         c->immutable_options().compaction_pri == kRoundRobin) {
       int start_level = c->start_level();
@@ -4113,14 +4191,12 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
             vstorage->GetNextCompactCursor(start_level, c->num_input_files(0)));
       }
     }
-    status = versions_->LogAndApply(
-        c->column_family_data(), read_options, write_options, c->edit(),
-        &mutex_, directories_.GetDbDir(),
-        /*new_descriptor_log=*/false, /*column_family_options=*/nullptr,
-        [&c, &compaction_released](const Status& s) {
-          c->ReleaseCompactionFiles(s);
-          compaction_released = true;
-        });
+
+    // Perform the trivial move
+    size_t moved_files = 0;
+    size_t moved_bytes = 0;
+    status = PerformTrivialMove(*c.get(), log_buffer, compaction_released,
+                                moved_files, moved_bytes);
     io_s = versions_->io_status();
     InstallSuperVersionAndScheduleWork(
         c->column_family_data(), job_context->superversion_contexts.data());
@@ -4135,8 +4211,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
           << "total_files_size" << moved_bytes;
     }
     ROCKS_LOG_BUFFER(
-        log_buffer,
-        "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
+        log_buffer, "[%s] Moved #%d files to level-%zu %zu bytes %s: %s\n",
         c->column_family_data()->GetName().c_str(), moved_files,
         c->output_level(), moved_bytes, status.ToString().c_str(),
         c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
index aeca38ec248763d52e2abcef079ac5edb15ea432..5fbc3324d024bdfbbf7492a814db9f56d0b9a39e 100644 (file)
@@ -2383,11 +2383,17 @@ struct CompactionOptions {
   // "default_write_temperature"
   Temperature output_temperature_override = Temperature::kUnknown;
 
+  // Option to optimize the manual compaction by enabling trivial move for non
+  // overlapping files.
+  // Default: false
+  bool allow_trivial_move;
+
   CompactionOptions()
       : compression(kDisableCompressionOption),
         output_file_size_limit(std::numeric_limits<uint64_t>::max()),
         max_subcompactions(0),
-        canceled(nullptr) {}
+        canceled(nullptr),
+        allow_trivial_move(false) {}
 };
 
 // For level based compaction, we can configure if we want to skip/force
diff --git a/unreleased_history/new_features/Trivial_move_support_in_CompactFiles_API.md b/unreleased_history/new_features/Trivial_move_support_in_CompactFiles_API.md
new file mode 100644 (file)
index 0000000..4c52fc3
--- /dev/null
@@ -0,0 +1 @@
+Add a new option `allow_trivial_move` in `CompactionOptions` to allow `CompactFiles` to perform a trivial move when possible. By default `allow_trivial_move` is false, so the original behavior is preserved.