From: Xingbo Wang Date: Mon, 10 Nov 2025 23:20:50 +0000 (-0800) Subject: Add trivial move support in CompactFiles API (#14112) X-Git-Tag: v10.8.3~4 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f324397aca72ea5dadc3c4c6b1f6452bb1b03ccb;p=rocksdb.git Add trivial move support in CompactFiles API (#14112) Summary: Support trivial move in CompactFiles API, which is not supported previously. Pull Request resolved: https://github.com/facebook/rocksdb/pull/14112 Test Plan: Unit test Reviewed By: cbi42 Differential Revision: D86546150 Pulled By: xingbowang fbshipit-source-id: 08a3ae9a055f3d3d41711403b1695f44977e6ea8 --- diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index 83bec82b9..b1331d1cc 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -534,6 +534,232 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) { delete db; } +// Helper function to generate zero-padded keys +// e.g., MakeKey("a", 5) -> "a05", MakeKey("b", 42) -> "b42" +static std::string MakeKey(const std::string& prefix, int index) { + return prefix + (index < 10 ? 
"0" : "") + std::to_string(index);
+}
+
+TEST_F(CompactFilesTest, TrivialMoveNonOverlappingFiles) {
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.level_compaction_dynamic_level_bytes = false;
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  ASSERT_NE(db, nullptr);
+
+  // Create 3 non-overlapping files in L0
+  // File 1: keys [a00-a99]
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("a", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // File 2: keys [b00-b99]
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("b", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // File 3: keys [c00-c99]
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("c", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Verify files are in L0
+  ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[0].files.size(), 3);
+  ASSERT_EQ(meta.levels[1].files.size(), 0);
+
+  // Get L0 files
+  std::vector<std::string> l0_files;
+  for (const auto& file : meta.levels[0].files) {
+    l0_files.push_back(file.db_path + "/" + file.name);
+  }
+
+  CompactionOptions compact_option;
+  compact_option.allow_trivial_move = true;
+  // Compact all L0 files to L1 (non-overlapping in L1)
+  ASSERT_OK(db->CompactFiles(compact_option, l0_files, 1));
+
+  // Verify files are now in L1
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[0].files.size(), 0);
+  ASSERT_EQ(meta.levels[1].files.size(), 3);
+
+  // Get the first file from L1 (should be the one with keys a00-a99)
+  std::string l1_file_to_move;
+  std::vector<std::string> l1_files_to_move_later;
+  uint64_t l1_file_number = 0;
+  for (const auto& file : 
meta.levels[1].files) {
+    if (file.smallestkey[0] == 'a') {
+      l1_file_to_move = file.db_path + "/" + file.name;
+      l1_file_number = file.file_number;
+    } else {
+      l1_files_to_move_later.push_back(file.db_path + "/" + file.name);
+    }
+  }
+  ASSERT_FALSE(l1_file_to_move.empty());
+
+  // Set up sync point to verify trivial move path is taken
+  bool trivial_move_executed = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::CompactFilesImpl:TrivialMove",
+      [&](void* /*arg*/) { trivial_move_executed = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Move the file from L1 to L6 - this should be a trivial move
+  // because the file doesn't overlap with anything in L6
+  std::vector<std::string> files_to_move = {l1_file_to_move};
+  ASSERT_OK(db->CompactFiles(compact_option, files_to_move, 6));
+
+  // Verify trivial move was executed
+  ASSERT_TRUE(trivial_move_executed);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Verify the file is now in L6
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[1].files.size(), 2);  // Two files remain in L1
+  ASSERT_EQ(meta.levels[6].files.size(), 1);  // One file in L6
+
+  // Verify it's the correct file in L6
+  bool found_file_in_l6 = false;
+  for (const auto& file : meta.levels[6].files) {
+    if (file.file_number == l1_file_number) {
+      found_file_in_l6 = true;
+      // Verify key range hasn't changed
+      ASSERT_EQ(file.smallestkey[0], 'a');
+      ASSERT_EQ(file.largestkey[0], 'a');
+      break;
+    }
+  }
+  ASSERT_TRUE(found_file_in_l6);
+
+  // Move the other 2 files from L1 to L6, with allow_trivial_move set to false.
+  // This will trigger a normal compaction, so the 2 files will be compacted
+  // into a single file in L6. 
+  ASSERT_OK(db->CompactFiles(CompactionOptions(), l1_files_to_move_later, 6));
+
+  // Verify files in L6
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[1].files.size(), 0);  // Zero files remain in L1
+  ASSERT_EQ(meta.levels[6].files.size(), 2);  // Two files in L6
+
+  // Verify data integrity - all keys should still be readable
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("a", i);
+    std::string value;
+    ASSERT_OK(db->Get(ReadOptions(), key, &value));
+    ASSERT_EQ(value, "value_" + key);
+  }
+
+  delete db;
+}
+
+TEST_F(CompactFilesTest, TrivialMoveBlockedByOverlap) {
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.num_levels = 7;
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  ASSERT_NE(db, nullptr);
+
+  // Create a file in L6 with keys [m00-m99] (wide range)
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("m", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Get L0 file
+  ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  std::vector<std::string> l0_files;
+  for (const auto& file : meta.levels[0].files) {
+    l0_files.push_back(file.db_path + "/" + file.name);
+  }
+
+  CompactionOptions compact_option;
+  compact_option.allow_trivial_move = true;
+
+  // Move to L6
+  ASSERT_OK(db->CompactFiles(compact_option, l0_files, 6));
+
+  // Now create a file in L1 with overlapping keys [m50-m60]
+  for (int i = 50; i <= 60; i++) {
+    std::string key = "m" + std::to_string(i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "updated_value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Get the L0 file
+  db->GetColumnFamilyMetaData(&meta);
+  std::vector<std::string> l0_files_2;
+  for (const auto& file : meta.levels[0].files) {
+    
l0_files_2.push_back(file.db_path + "/" + file.name); + } + + // Move to L1 + ASSERT_OK(db->CompactFiles(compact_option, l0_files_2, 1)); + + // Get the L1 file + db->GetColumnFamilyMetaData(&meta); + ASSERT_EQ(meta.levels[1].files.size(), 1); + std::string l1_file = + meta.levels[1].files[0].db_path + "/" + meta.levels[1].files[0].name; + + // Set up sync point to verify full compaction path is taken + bool trivial_move_executed = false; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::CompactFilesImpl:TrivialMove", + [&](void* /*arg*/) { trivial_move_executed = true; }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Try to move from L1 to L6 - this should NOT be a trivial move + // because the file overlaps with the existing file in L6 + ASSERT_OK(db->CompactFiles(compact_option, {l1_file}, 6)); + + // Verify trivial move was NOT executed (full compaction happened) + ASSERT_FALSE(trivial_move_executed); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Verify the result - should have merged data in L6 + db->GetColumnFamilyMetaData(&meta); + ASSERT_EQ(meta.levels[1].files.size(), 0); // L1 should be empty + // L6 should have the merged file (may be 1 file if merged, or 2 if not) + ASSERT_GE(meta.levels[6].files.size(), 1); + + // Verify updated values are present + for (int i = 50; i <= 60; i++) { + std::string key = "m" + std::to_string(i); + std::string value; + ASSERT_OK(db->Get(ReadOptions(), key, &value)); + ASSERT_EQ(value, "updated_value_" + key); + } + + delete db; +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index da1879688..c3d045725 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -2396,6 +2396,14 @@ class DBImpl : public DB { JobContext* job_context, LogBuffer* log_buffer, CompactionJobInfo* compaction_job_info); + // Helper function to perform trivial move by updating manifest metadata 
+  // without rewriting data files. This is called when IsTrivialMove() is true.
+  // REQUIRES: mutex held
+  // Returns: Status of the trivial move operation
+  Status PerformTrivialMove(Compaction& c, LogBuffer* log_buffer,
+                            bool& compaction_released, size_t& moved_files,
+                            size_t& moved_bytes);
+
   // REQUIRES: mutex unlocked
   void TrackOrUntrackFiles(const std::vector<std::string>& existing_data_files,
                            bool track);
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 5e8838747..9f4d08e93 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1424,6 +1424,56 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
   return s;
 }
 
+Status DBImpl::PerformTrivialMove(Compaction& c, LogBuffer* log_buffer,
+                                  bool& compaction_released,
+                                  size_t& moved_files, size_t& moved_bytes) {
+  mutex_.AssertHeld();
+
+  ROCKS_LOG_BUFFER(log_buffer, "[%s] Moving %d files to level-%d\n",
+                   c.column_family_data()->GetName().c_str(),
+                   static_cast<int>(c.num_input_files(0)), c.output_level());
+
+  // Move files to the output level by editing the manifest
+  for (unsigned int l = 0; l < c.num_input_levels(); l++) {
+    if (c.level(l) == c.output_level()) {
+      continue;
+    }
+    for (size_t i = 0; i < c.num_input_files(l); i++) {
+      FileMetaData* f = c.input(l, i);
+      c.edit()->DeleteFile(c.level(l), f->fd.GetNumber());
+      c.edit()->AddFile(c.output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
+                        f->fd.GetFileSize(), f->smallest, f->largest,
+                        f->fd.smallest_seqno, f->fd.largest_seqno,
+                        f->marked_for_compaction, f->temperature,
+                        f->oldest_blob_file_number, f->oldest_ancester_time,
+                        f->file_creation_time, f->epoch_number,
+                        f->file_checksum, f->file_checksum_func_name,
+                        f->unique_id, f->compensated_range_deletion_size,
+                        f->tail_size, f->user_defined_timestamps_persisted);
+      moved_bytes += static_cast<size_t>(c.input(l, i)->fd.GetFileSize());
+      ROCKS_LOG_BUFFER(
+          log_buffer, "[%s] Moved #%" PRIu64 " to level-%d %" 
PRIu64 " bytes\n",
+          c.column_family_data()->GetName().c_str(), f->fd.GetNumber(),
+          c.output_level(), f->fd.GetFileSize());
+    }
+    moved_files += c.num_input_files(l);
+  }
+
+  // Install the new version
+  const ReadOptions read_options(Env::IOActivity::kCompaction);
+  const WriteOptions write_options(Env::IOActivity::kCompaction);
+  Status status = versions_->LogAndApply(
+      c.column_family_data(), read_options, write_options, c.edit(), &mutex_,
+      directories_.GetDbDir(), /*new_descriptor_log=*/false,
+      /*column_family_options=*/nullptr,
+      [&c, &compaction_released](const Status& s) {
+        c.ReleaseCompactionFiles(s);
+        compaction_released = true;
+      });
+
+  return status;
+}
+
 Status DBImpl::CompactFilesImpl(
     const CompactionOptions& compact_options, ColumnFamilyData* cfd,
     Version* version, const std::vector<std::string>& input_file_names,
@@ -1511,6 +1561,63 @@ Status DBImpl::CompactFilesImpl(
   // deletion compaction currently not allowed in CompactFiles.
   assert(!c->deletion_compaction());
 
+  // Check if this can be a trivial move (metadata-only update)
+  // Similar to the logic in DBImpl::BackgroundCompaction
+  // Note: We disable trivial move when compaction_service is present because
+  // the service expects all compactions to go through CompactionJob for
+  // tracking
+  bool is_trivial_move = compact_options.allow_trivial_move &&
+                         c->IsTrivialMove() &&
+                         immutable_db_options().compaction_service == nullptr;
+
+  if (is_trivial_move) {
+    // Perform trivial move: just update manifest without rewriting data
+    TEST_SYNC_POINT("DBImpl::CompactFilesImpl:TrivialMove");
+
+    bool compaction_released = false;
+    size_t moved_files = 0;
+    size_t moved_bytes = 0;
+    Status status = PerformTrivialMove(
+        *c.get(), log_buffer, compaction_released, moved_files, moved_bytes);
+
+    if (status.ok()) {
+      InstallSuperVersionAndScheduleWork(
+          c->column_family_data(), job_context->superversion_contexts.data());
+
+      // Populate output file names for trivial move
+      if (output_file_names != nullptr) {
+        
for (const auto& newf : c->edit()->GetNewFiles()) {
+          output_file_names->push_back(TableFileName(
+              c->immutable_options().cf_paths, newf.second.fd.GetNumber(),
+              newf.second.fd.GetPathId()));
+        }
+      }
+
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "[%s] Trivial move succeeded for %zu files, %zu bytes total\n",
+          c->column_family_data()->GetName().c_str(), moved_files, moved_bytes);
+    } else {
+      if (!compaction_released) {
+        c->ReleaseCompactionFiles(status);
+      }
+      ROCKS_LOG_BUFFER(log_buffer, "[%s] Trivial move failed: %s\n",
+                       c->column_family_data()->GetName().c_str(),
+                       status.ToString().c_str());
+      error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+    }
+
+    c.reset();
+    bg_compaction_scheduled_--;
+    if (bg_compaction_scheduled_ == 0) {
+      bg_cv_.SignalAll();
+    }
+    MaybeScheduleFlushOrCompaction();
+
+    return status;
+  }
+
+  // Not a trivial move, proceed with full compaction
   InitSnapshotContext(job_context);
 
   std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
@@ -4074,35 +4181,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
       NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
                               compaction_job_stats, job_context->job_id);
 
-      // Move files to next level
-      int32_t moved_files = 0;
-      int64_t moved_bytes = 0;
-      for (unsigned int l = 0; l < c->num_input_levels(); l++) {
-        if (c->level(l) == c->output_level()) {
-          continue;
-        }
-        for (size_t i = 0; i < c->num_input_files(l); i++) {
-          FileMetaData* f = c->input(l, i);
-          c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
-          c->edit()->AddFile(
-              c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
-              f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno,
-              f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
-              f->oldest_blob_file_number, f->oldest_ancester_time,
-              f->file_creation_time, f->epoch_number, f->file_checksum,
-              f->file_checksum_func_name, f->unique_id,
-              f->compensated_range_deletion_size, f->tail_size,
-              
f->user_defined_timestamps_persisted);
-
-          ROCKS_LOG_BUFFER(
-              log_buffer,
-              "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
-              c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
-              c->output_level(), f->fd.GetFileSize());
-          ++moved_files;
-          moved_bytes += f->fd.GetFileSize();
-        }
-      }
       if (c->compaction_reason() == CompactionReason::kLevelMaxLevelSize &&
           c->immutable_options().compaction_pri == kRoundRobin) {
         int start_level = c->start_level();
@@ -4113,14 +4191,12 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
               vstorage->GetNextCompactCursor(start_level, c->num_input_files(0)));
         }
       }
-      status = versions_->LogAndApply(
-          c->column_family_data(), read_options, write_options, c->edit(),
-          &mutex_, directories_.GetDbDir(),
-          /*new_descriptor_log=*/false, /*column_family_options=*/nullptr,
-          [&c, &compaction_released](const Status& s) {
-            c->ReleaseCompactionFiles(s);
-            compaction_released = true;
-          });
+
+      // Perform the trivial move
+      size_t moved_files = 0;
+      size_t moved_bytes = 0;
+      status = PerformTrivialMove(*c.get(), log_buffer, compaction_released,
+                                  moved_files, moved_bytes);
       io_s = versions_->io_status();
       InstallSuperVersionAndScheduleWork(
           c->column_family_data(), job_context->superversion_contexts.data());
@@ -4135,8 +4211,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
                           << "total_files_size" << moved_bytes;
       }
       ROCKS_LOG_BUFFER(
-          log_buffer,
-          "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
+          log_buffer, "[%s] Moved #%zu files to level-%d %zu bytes %s: %s\n",
           c->column_family_data()->GetName().c_str(), moved_files,
           c->output_level(), moved_bytes, status.ToString().c_str(),
           c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index aeca38ec2..5fbc3324d 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -2383,11 +2383,17 @@ struct CompactionOptions {
   // "default_write_temperature" 
Temperature output_temperature_override = Temperature::kUnknown;
 
+  // Option to optimize manual compaction by enabling trivial moves for
+  // non-overlapping input files.
+  // Default: false
+  bool allow_trivial_move;
+
   CompactionOptions()
       : compression(kDisableCompressionOption),
         output_file_size_limit(std::numeric_limits<uint64_t>::max()),
         max_subcompactions(0),
-        canceled(nullptr) {}
+        canceled(nullptr),
+        allow_trivial_move(false) {}
 };
 
 // For level based compaction, we can configure if we want to skip/force
diff --git a/unreleased_history/new_features/Trivial_move_support_in_CompactFiles_API.md b/unreleased_history/new_features/Trivial_move_support_in_CompactFiles_API.md
new file mode 100644
index 000000000..4c52fc3ab
--- /dev/null
+++ b/unreleased_history/new_features/Trivial_move_support_in_CompactFiles_API.md
@@ -0,0 +1 @@
+Add a new option `allow_trivial_move` in `CompactionOptions` to allow `CompactFiles` to perform a trivial move when possible. By default `allow_trivial_move` is false, so the original behavior is preserved.