delete db;
}
+// Build a key of the form "<prefix><index>", zero-padding single-digit
+// indices to two characters.
+// e.g., MakeKey("a", 5) -> "a05", MakeKey("b", 42) -> "b42"
+static std::string MakeKey(const std::string& prefix, int index) {
+ std::string key = prefix;
+ if (index < 10) {
+ key += '0';
+ }
+ key += std::to_string(index);
+ return key;
+}
+
+TEST_F(CompactFilesTest, TrivialMoveNonOverlappingFiles) {
+ Options options;
+ options.create_if_missing = true;
+ // Keep compaction fully under the test's control.
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.level_compaction_dynamic_level_bytes = false;
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+ ASSERT_NE(db, nullptr);
+
+ // Create 3 non-overlapping files in L0
+ // File 1: keys [a00-a99]
+ for (int i = 0; i < 100; i++) {
+ std::string key = MakeKey("a", i);
+ ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+ }
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ // File 2: keys [b00-b99]
+ for (int i = 0; i < 100; i++) {
+ std::string key = MakeKey("b", i);
+ ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+ }
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ // File 3: keys [c00-c99]
+ for (int i = 0; i < 100; i++) {
+ std::string key = MakeKey("c", i);
+ ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+ }
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ // Verify files are in L0
+ ColumnFamilyMetaData meta;
+ db->GetColumnFamilyMetaData(&meta);
+ ASSERT_EQ(meta.levels[0].files.size(), 3);
+ ASSERT_EQ(meta.levels[1].files.size(), 0);
+
+ // Get L0 files
+ std::vector<std::string> l0_files;
+ for (const auto& file : meta.levels[0].files) {
+ l0_files.push_back(file.db_path + "/" + file.name);
+ }
+
+ CompactionOptions compact_option;
+ compact_option.allow_trivial_move = true;
+ // Compact all L0 files to L1 (non-overlapping in L1)
+ ASSERT_OK(db->CompactFiles(compact_option, l0_files, 1));
+
+ // Verify files are now in L1
+ db->GetColumnFamilyMetaData(&meta);
+ ASSERT_EQ(meta.levels[0].files.size(), 0);
+ ASSERT_EQ(meta.levels[1].files.size(), 3);
+
+ // Get the first file from L1 (should be the one with keys a00-a99)
+ std::string l1_file_to_move;
+ std::vector<std::string> l1_files_to_move_later;
+ // Remember the file number so we can confirm later that the very same
+ // physical file (not a rewritten one) ends up in L6.
+ uint64_t l1_file_number = 0;
+ for (const auto& file : meta.levels[1].files) {
+ if (file.smallestkey[0] == 'a') {
+ l1_file_to_move = file.db_path + "/" + file.name;
+ l1_file_number = file.file_number;
+ } else {
+ l1_files_to_move_later.push_back(file.db_path + "/" + file.name);
+ }
+ }
+ ASSERT_FALSE(l1_file_to_move.empty());
+
+ // Set up sync point to verify trivial move path is taken
+ bool trivial_move_executed = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::CompactFilesImpl:TrivialMove",
+ [&](void* /*arg*/) { trivial_move_executed = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Move the file from L1 to L6 - this should be a trivial move
+ // because the file doesn't overlap with anything in L6
+ std::vector<std::string> files_to_move = {l1_file_to_move};
+ ASSERT_OK(db->CompactFiles(compact_option, files_to_move, 6));
+
+ // Verify trivial move was executed
+ ASSERT_TRUE(trivial_move_executed);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Verify the file is now in L6
+ db->GetColumnFamilyMetaData(&meta);
+ ASSERT_EQ(meta.levels[1].files.size(), 2); // Two files remain in L1
+ ASSERT_EQ(meta.levels[6].files.size(), 1); // One file in L6
+
+ // Verify it's the correct file in L6
+ bool found_file_in_l6 = false;
+ for (const auto& file : meta.levels[6].files) {
+ if (file.file_number == l1_file_number) {
+ found_file_in_l6 = true;
+ // Verify key range hasn't changed
+ ASSERT_EQ(file.smallestkey[0], 'a');
+ ASSERT_EQ(file.largestkey[0], 'a');
+ break;
+ }
+ }
+ ASSERT_TRUE(found_file_in_l6);
+
+ // Move the other 2 files from L1 to L6, with allow_trivial_move set to false.
+ // This will trigger a normal compaction, so the 2 files will be compacted
+ // into a single file in L6.
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l1_files_to_move_later, 6));
+
+ // Verify files in L6
+ db->GetColumnFamilyMetaData(&meta);
+ ASSERT_EQ(meta.levels[1].files.size(), 0); // Zero files remain in L1
+ ASSERT_EQ(meta.levels[6].files.size(), 2); // Two files in L6
+
+ // Verify data integrity - the keys of the trivially moved file should
+ // still be readable with their original values.
+ for (int i = 0; i < 100; i++) {
+ std::string key = MakeKey("a", i);
+ std::string value;
+ ASSERT_OK(db->Get(ReadOptions(), key, &value));
+ ASSERT_EQ(value, "value_" + key);
+ }
+
+ delete db;
+}
+
+TEST_F(CompactFilesTest, TrivialMoveBlockedByOverlap) {
+ Options options;
+ options.create_if_missing = true;
+ // Keep compaction fully under the test's control.
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.level_compaction_dynamic_level_bytes = false;
+ options.num_levels = 7;
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+ ASSERT_NE(db, nullptr);
+
+ // Write keys [m00-m99] (wide range); the resulting L0 file is moved to L6
+ // below.
+ for (int i = 0; i < 100; i++) {
+ std::string key = MakeKey("m", i);
+ ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+ }
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ // Get L0 file
+ ColumnFamilyMetaData meta;
+ db->GetColumnFamilyMetaData(&meta);
+ std::vector<std::string> l0_files;
+ for (const auto& file : meta.levels[0].files) {
+ l0_files.push_back(file.db_path + "/" + file.name);
+ }
+
+ CompactionOptions compact_option;
+ compact_option.allow_trivial_move = true;
+
+ // Move to L6
+ ASSERT_OK(db->CompactFiles(compact_option, l0_files, 6));
+
+ // Now create a file with overlapping keys [m50-m60]. Use MakeKey so the
+ // key format matches the first batch regardless of the index range.
+ for (int i = 50; i <= 60; i++) {
+ std::string key = MakeKey("m", i);
+ ASSERT_OK(db->Put(WriteOptions(), key, "updated_value_" + key));
+ }
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ // Get the L0 file
+ db->GetColumnFamilyMetaData(&meta);
+ std::vector<std::string> l0_files_2;
+ for (const auto& file : meta.levels[0].files) {
+ l0_files_2.push_back(file.db_path + "/" + file.name);
+ }
+
+ // Move to L1
+ ASSERT_OK(db->CompactFiles(compact_option, l0_files_2, 1));
+
+ // Get the L1 file
+ db->GetColumnFamilyMetaData(&meta);
+ ASSERT_EQ(meta.levels[1].files.size(), 1);
+ std::string l1_file =
+ meta.levels[1].files[0].db_path + "/" + meta.levels[1].files[0].name;
+
+ // Set up sync point to verify full compaction path is taken
+ bool trivial_move_executed = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::CompactFilesImpl:TrivialMove",
+ [&](void* /*arg*/) { trivial_move_executed = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Try to move from L1 to L6 - this should NOT be a trivial move
+ // because the file overlaps with the existing file in L6
+ ASSERT_OK(db->CompactFiles(compact_option, {l1_file}, 6));
+
+ // Verify trivial move was NOT executed (full compaction happened)
+ ASSERT_FALSE(trivial_move_executed);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Verify the result - should have merged data in L6
+ db->GetColumnFamilyMetaData(&meta);
+ ASSERT_EQ(meta.levels[1].files.size(), 0); // L1 should be empty
+ // L6 should have the merged file (may be 1 file if merged, or 2 if not)
+ ASSERT_GE(meta.levels[6].files.size(), 1);
+
+ // Verify updated values are present
+ for (int i = 50; i <= 60; i++) {
+ std::string key = MakeKey("m", i);
+ std::string value;
+ ASSERT_OK(db->Get(ReadOptions(), key, &value));
+ ASSERT_EQ(value, "updated_value_" + key);
+ }
+
+ delete db;
+}
+
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {
JobContext* job_context, LogBuffer* log_buffer,
CompactionJobInfo* compaction_job_info);
+ // Helper function to perform trivial move by updating manifest metadata
+ // without rewriting data files. This is called when IsTrivialMove() is true.
+ // REQUIRES: mutex held
+ // Returns: Status of the trivial move operation
+ Status PerformTrivialMove(Compaction& c, LogBuffer* log_buffer,
+ bool& compaction_released, size_t& moved_files,
+ size_t& moved_bytes);
+
// REQUIRES: mutex unlocked
void TrackOrUntrackFiles(const std::vector<std::string>& existing_data_files,
bool track);
return s;
}
+// Re-link the compaction's input files into the output level by editing the
+// manifest only; no table file is read or rewritten.
+// REQUIRES: mutex_ held.
+// On return, `moved_files`/`moved_bytes` have been incremented by the number
+// of files/bytes re-linked. The compaction's files are released through the
+// LogAndApply manifest-write callback, which sets `compaction_released`.
+// Returns the status of the LogAndApply manifest write.
+Status DBImpl::PerformTrivialMove(Compaction& c, LogBuffer* log_buffer,
+ bool& compaction_released,
+ size_t& moved_files, size_t& moved_bytes) {
+ mutex_.AssertHeld();
+
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Moving %d files to level-%d\n",
+ c.column_family_data()->GetName().c_str(),
+ static_cast<int>(c.num_input_files(0)), c.output_level());
+
+ // Move files to the output level by editing the manifest
+ for (unsigned int l = 0; l < c.num_input_levels(); l++) {
+ if (c.level(l) == c.output_level()) {
+ // Files already at the output level need no re-link.
+ continue;
+ }
+ for (size_t i = 0; i < c.num_input_files(l); i++) {
+ FileMetaData* f = c.input(l, i);
+ c.edit()->DeleteFile(c.level(l), f->fd.GetNumber());
+ // Re-add the same physical file at the output level, preserving all
+ // of its metadata (seqnos, checksum, epoch number, etc.).
+ c.edit()->AddFile(c.output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest,
+ f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->temperature,
+ f->oldest_blob_file_number, f->oldest_ancester_time,
+ f->file_creation_time, f->epoch_number,
+ f->file_checksum, f->file_checksum_func_name,
+ f->unique_id, f->compensated_range_deletion_size,
+ f->tail_size, f->user_defined_timestamps_persisted);
+ moved_bytes += static_cast<size_t>(f->fd.GetFileSize());
+ ROCKS_LOG_BUFFER(
+ log_buffer, "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
+ c.column_family_data()->GetName().c_str(), f->fd.GetNumber(),
+ c.output_level(), f->fd.GetFileSize());
+ }
+ moved_files += c.num_input_files(l);
+ }
+
+ // Install the new version
+ const ReadOptions read_options(Env::IOActivity::kCompaction);
+ const WriteOptions write_options(Env::IOActivity::kCompaction);
+ Status status = versions_->LogAndApply(
+ c.column_family_data(), read_options, write_options, c.edit(), &mutex_,
+ directories_.GetDbDir(), /*new_descriptor_log=*/false,
+ /*column_family_options=*/nullptr,
+ [&c, &compaction_released](const Status& s) {
+ c.ReleaseCompactionFiles(s);
+ compaction_released = true;
+ });
+
+ return status;
+}
+
Status DBImpl::CompactFilesImpl(
const CompactionOptions& compact_options, ColumnFamilyData* cfd,
Version* version, const std::vector<std::string>& input_file_names,
// deletion compaction currently not allowed in CompactFiles.
assert(!c->deletion_compaction());
+ // Check if this can be a trivial move (metadata-only update)
+ // Similar to the logic in DBImpl::BackgroundCompaction
+ // Note: We disable trivial move when compaction_service is present because
+ // the service expects all compactions to go through CompactionJob for
+ // tracking
+ bool is_trivial_move = compact_options.allow_trivial_move &&
+ c->IsTrivialMove() &&
+ immutable_db_options().compaction_service == nullptr;
+
+ if (is_trivial_move) {
+ // Perform trivial move: just update manifest without rewriting data
+ TEST_SYNC_POINT("DBImpl::CompactFilesImpl:TrivialMove");
+
+ bool compaction_released = false;
+ size_t moved_files = 0;
+ size_t moved_bytes = 0;
+ Status status = PerformTrivialMove(
+ *c.get(), log_buffer, compaction_released, moved_files, moved_bytes);
+
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(
+ c->column_family_data(), job_context->superversion_contexts.data());
+
+ // Populate output file names for trivial move
+ if (output_file_names != nullptr) {
+ for (const auto& newf : c->edit()->GetNewFiles()) {
+ output_file_names->push_back(TableFileName(
+ c->immutable_options().cf_paths, newf.second.fd.GetNumber(),
+ newf.second.fd.GetPathId()));
+ }
+ }
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Trivial move succeeded for %zu files, %zu bytes total\n",
+ c->column_family_data()->GetName().c_str(), moved_files, moved_bytes);
+ } else {
+ if (!compaction_released) {
+ c->ReleaseCompactionFiles(status);
+ }
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Trivial move failed: %s\n",
+ c->column_family_data()->GetName().c_str(),
+ status.ToString().c_str());
+ error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+ }
+
+ c.reset();
+ bg_compaction_scheduled_--;
+ if (bg_compaction_scheduled_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ MaybeScheduleFlushOrCompaction();
+
+ return status;
+ }
+
+ // Not a trivial move, proceed with full compaction
InitSnapshotContext(job_context);
std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
compaction_job_stats, job_context->job_id);
- // Move files to next level
- int32_t moved_files = 0;
- int64_t moved_bytes = 0;
- for (unsigned int l = 0; l < c->num_input_levels(); l++) {
- if (c->level(l) == c->output_level()) {
- continue;
- }
- for (size_t i = 0; i < c->num_input_files(l); i++) {
- FileMetaData* f = c->input(l, i);
- c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
- c->edit()->AddFile(
- c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
- f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno,
- f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
- f->oldest_blob_file_number, f->oldest_ancester_time,
- f->file_creation_time, f->epoch_number, f->file_checksum,
- f->file_checksum_func_name, f->unique_id,
- f->compensated_range_deletion_size, f->tail_size,
- f->user_defined_timestamps_persisted);
-
- ROCKS_LOG_BUFFER(
- log_buffer,
- "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
- c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
- c->output_level(), f->fd.GetFileSize());
- ++moved_files;
- moved_bytes += f->fd.GetFileSize();
- }
- }
if (c->compaction_reason() == CompactionReason::kLevelMaxLevelSize &&
c->immutable_options().compaction_pri == kRoundRobin) {
int start_level = c->start_level();
vstorage->GetNextCompactCursor(start_level, c->num_input_files(0)));
}
}
- status = versions_->LogAndApply(
- c->column_family_data(), read_options, write_options, c->edit(),
- &mutex_, directories_.GetDbDir(),
- /*new_descriptor_log=*/false, /*column_family_options=*/nullptr,
- [&c, &compaction_released](const Status& s) {
- c->ReleaseCompactionFiles(s);
- compaction_released = true;
- });
+
+ // Perform the trivial move
+ size_t moved_files = 0;
+ size_t moved_bytes = 0;
+ status = PerformTrivialMove(*c.get(), log_buffer, compaction_released,
+ moved_files, moved_bytes);
io_s = versions_->io_status();
InstallSuperVersionAndScheduleWork(
c->column_family_data(), job_context->superversion_contexts.data());
<< "total_files_size" << moved_bytes;
}
ROCKS_LOG_BUFFER(
- log_buffer,
- "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
+ log_buffer, "[%s] Moved #%d files to level-%zu %zu bytes %s: %s\n",
c->column_family_data()->GetName().c_str(), moved_files,
c->output_level(), moved_bytes, status.ToString().c_str(),
c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
// "default_write_temperature"
Temperature output_temperature_override = Temperature::kUnknown;
+ // If true, CompactFiles() may perform a trivial move (re-link input files
+ // to the target level via a metadata-only manifest edit) when the inputs
+ // do not overlap the output level, instead of rewriting the files.
+ // Default: false (preserves the original CompactFiles behavior)
+ bool allow_trivial_move;
+
CompactionOptions()
: compression(kDisableCompressionOption),
output_file_size_limit(std::numeric_limits<uint64_t>::max()),
max_subcompactions(0),
- canceled(nullptr) {}
+ canceled(nullptr),
+ allow_trivial_move(false) {}
};
// For level based compaction, we can configure if we want to skip/force
--- /dev/null
+Add a new option, allow_trivial_move, to CompactionOptions so that CompactFiles can perform a trivial move when possible. By default allow_trivial_move is false, so the original behavior is preserved.