## Unreleased
### Behavior changes
* Make best-efforts recovery verify SST unique ID before Version construction (#10962)
+* Introduce `epoch_number` and sort L0 files by `epoch_number` instead of `largest_seqno`. `epoch_number` represents the order in which a file was flushed or ingested/imported. A compaction output file is assigned the minimum `epoch_number` among its input files. For L0, a larger `epoch_number` indicates a newer L0 file.
### Bug Fixes
* Fixed a regression in iterator where range tombstones after `iterate_upper_bound` are processed.
* Fixed a bug that multi-level FIFO compaction deletes one file in non-L0 even when `CompactionOptionsFIFO::max_table_files_size` is not exceeded since #10348 or 7.8.0.
* Fixed a bug caused by `DB::SyncWAL()` affecting `track_and_verify_wals_in_manifest`. Without the fix, application may see "open error: Corruption: Missing WAL with log number" while trying to open the db. The corruption is a false alarm but prevents DB open (#10892).
* Fixed a BackupEngine bug in which RestoreDBFromLatestBackup would fail if the latest backup was deleted and there is another valid backup available.
+* Fix L0 file misorder corruption caused by ingesting files whose seqnos overlap with memtable entries' seqnos, through introducing `epoch_number`. Before the fix, `force_consistency_checks=true` may catch the corruption before it's exposed to readers, in which case writes returning `Status::Corruption` would be expected. Also replace the previous incomplete fix (#5958) to the same corruption with this new and more complete fix.
## 7.9.0 (11/21/2022)
### Performance Improvements
allow_2pc_(db_options.allow_2pc),
last_memtable_id_(0),
db_paths_registered_(false),
- mempurge_used_(false) {
+ mempurge_used_(false),
+ next_epoch_number_(1) {
if (id_ != kDummyColumnFamilyDataId) {
// TODO(cc): RegisterDbPaths can be expensive, considering moving it
// outside of this constructor which might be called with db mutex held.
Compaction* ColumnFamilyData::PickCompaction(
const MutableCFOptions& mutable_options,
const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) {
- SequenceNumber earliest_mem_seqno =
- std::min(mem_->GetEarliestSequenceNumber(),
- imm_.current()->GetEarliestSequenceNumber(false));
auto* result = compaction_picker_->PickCompaction(
GetName(), mutable_options, mutable_db_options, current_->storage_info(),
- log_buffer, earliest_mem_seqno);
+ log_buffer);
if (result != nullptr) {
result->SetInputVersion(current_);
}
return data_dirs_[path_id].get();
}
+void ColumnFamilyData::RecoverEpochNumbers() {  // Delegates epoch-number recovery to the storage info of `current_`
+ assert(current_);  // `current_` must be non-null here (debug-only check)
+ auto* vstorage = current_->storage_info();
+ assert(vstorage);
+ vstorage->RecoverEpochNumbers(this);  // hands over this CF; the recovery logic itself lives in the callee (not visible here)
+}
+
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
const ImmutableDBOptions* db_options,
const FileOptions& file_options,
void SetMempurgeUsed() { mempurge_used_ = true; }
bool GetMempurgeUsed() { return mempurge_used_; }
+ // Atomically return the current next epoch number and advance it by one
+ uint64_t NewEpochNumber() { return next_epoch_number_.fetch_add(1); }
+
+ // Return the next epoch number to be assigned, without allocating it
+ uint64_t GetNextEpochNumber() const { return next_epoch_number_.load(); }
+
+ // Overwrite the next epoch number to be assigned with `next_epoch_number`
+ void SetNextEpochNumber(uint64_t next_epoch_number) {
+ next_epoch_number_.store(next_epoch_number);  // unconditional store; no check against the current value
+ }
+
+ // Reset the next epoch number to be assigned back to its initial value, 1
+ void ResetNextEpochNumber() { next_epoch_number_.store(1); }
+
+ // Recover the next epoch number of this CF and the epoch numbers
+ // of its files (if missing), via the current Version's storage info
+ void RecoverEpochNumbers();
+
private:
friend class ColumnFamilySet;
ColumnFamilyData(uint32_t id, const std::string& name,
// a Version associated with this CFD
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
bool mempurge_used_;
+
+ std::atomic<uint64_t> next_epoch_number_;
};
// ColumnFamilySet has interesting thread-safety requirements
return min_oldest_ancester_time;
}
+uint64_t Compaction::MinInputFileEpochNumber() const {  // Smallest `epoch_number` over all files in all entries of `inputs_`
+ uint64_t min_epoch_number = std::numeric_limits<uint64_t>::max();  // sentinel; returned unchanged when there are no input files
+ for (const auto& inputs_per_level : inputs_) {
+ for (const auto& file : inputs_per_level.files) {
+ min_epoch_number = std::min(min_epoch_number, file->epoch_number);
+ }
+ }
+ return min_epoch_number;
+}
+
int Compaction::EvaluatePenultimateLevel(
const VersionStorageInfo* vstorage,
const ImmutableOptions& immutable_options, const int start_level,
// This is used to filter out some input files' ancester's time range.
uint64_t MinInputFileOldestAncesterTime(const InternalKey* start,
const InternalKey* end) const;
+ // Return the minimum epoch number among
+ // the input files associated with this compaction
+ uint64_t MinInputFileEpochNumber() const;
// Called by DBImpl::NotifyOnCompactionCompleted to make sure number of
// compaction begin and compaction completion callbacks match.
}
// Initialize a SubcompactionState::Output and add it to sub_compact->outputs
+ uint64_t epoch_number = sub_compact->compaction->MinInputFileEpochNumber();
{
FileMetaData meta;
meta.fd = FileDescriptor(file_number,
sub_compact->compaction->output_path_id(), 0);
meta.oldest_ancester_time = oldest_ancester_time;
meta.file_creation_time = current_time;
+ meta.epoch_number = epoch_number;
meta.temperature = temperature;
assert(!db_id_.empty());
assert(!db_session_id_.empty());
std::string largest_internal_key;
uint64_t oldest_ancester_time;
uint64_t file_creation_time;
+ uint64_t epoch_number;
uint64_t paranoid_hash;
bool marked_for_compaction;
UniqueId64x2 unique_id;
const std::string& name, SequenceNumber smallest, SequenceNumber largest,
std::string _smallest_internal_key, std::string _largest_internal_key,
uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
- uint64_t _paranoid_hash, bool _marked_for_compaction,
- UniqueId64x2 _unique_id)
+ uint64_t _epoch_number, uint64_t _paranoid_hash,
+ bool _marked_for_compaction, UniqueId64x2 _unique_id)
: file_name(name),
smallest_seqno(smallest),
largest_seqno(largest),
largest_internal_key(std::move(_largest_internal_key)),
oldest_ancester_time(_oldest_ancester_time),
file_creation_time(_file_creation_time),
+ epoch_number(_epoch_number),
paranoid_hash(_paranoid_hash),
marked_for_compaction(_marked_for_compaction),
unique_id(std::move(_unique_id)) {}
}
VersionEdit edit;
- edit.AddFile(level, file_number, 0, file_size, smallest_key, largest_key,
- smallest_seqno, largest_seqno, false, Temperature::kUnknown,
- oldest_blob_file_number, kUnknownOldestAncesterTime,
- kUnknownFileCreationTime, kUnknownFileChecksum,
- kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ edit.AddFile(
+ level, file_number, 0, file_size, smallest_key, largest_key,
+ smallest_seqno, largest_seqno, false, Temperature::kUnknown,
+ oldest_blob_file_number, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime,
+ versions_->GetColumnFamilySet()->GetDefault()->NewEpochNumber(),
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
mutex_.Lock();
EXPECT_OK(
rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX),
- rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id);
+ rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id);
}
result.output_level = rnd.Uniform(10);
result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen));
size_t min_files_to_compact,
uint64_t max_compact_bytes_per_del_file,
uint64_t max_compaction_bytes,
- CompactionInputFiles* comp_inputs,
- SequenceNumber earliest_mem_seqno) {
- // Do not pick ingested file when there is at least one memtable not flushed
- // which of seqno is overlap with the sst.
+ CompactionInputFiles* comp_inputs) {
TEST_SYNC_POINT("FindIntraL0Compaction");
+
size_t start = 0;
- for (; start < level_files.size(); start++) {
- if (level_files[start]->being_compacted) {
- return false;
- }
- // If there is no data in memtable, the earliest sequence number would the
- // largest sequence number in last memtable.
- // Because all files are sorted in descending order by largest_seqno, so we
- // only need to check the first one.
- if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) {
- break;
- }
- }
- if (start >= level_files.size()) {
+
+ if (level_files.size() == 0 || level_files[start]->being_compacted) {
return false;
}
+
size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size);
size_t compact_bytes_per_del_file = std::numeric_limits<size_t>::max();
// Compaction range will be [start, limit).
current_files[f].name +
" is currently being compacted.");
}
+
input_files->insert(TableFileNameToNumber(current_files[f].name));
}
virtual ~CompactionPicker();
// Pick level and inputs for a new compaction.
+ //
// Returns nullptr if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that
// describes the compaction. Caller should delete the result.
- virtual Compaction* PickCompaction(
- const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
- const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
- LogBuffer* log_buffer,
- SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0;
+ virtual Compaction* PickCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer) = 0;
// Return a compaction object for compacting the range [begin,end] in
// the specified level. Returns nullptr if there is nothing in that
// files. If it's not possible to convert an invalid input_files
// into a valid one by adding more files, the function will return a
// non-ok status with specific reason.
+//
#ifndef ROCKSDB_LITE
Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files,
const ColumnFamilyMetaData& cf_meta,
virtual ~NullCompactionPicker() {}
// Always return "nullptr"
- Compaction* PickCompaction(
- const std::string& /*cf_name*/,
- const MutableCFOptions& /*mutable_cf_options*/,
- const MutableDBOptions& /*mutable_db_options*/,
- VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */,
- SequenceNumber /* earliest_memtable_seqno */) override {
+ Compaction* PickCompaction(const std::string& /*cf_name*/,
+ const MutableCFOptions& /*mutable_cf_options*/,
+ const MutableDBOptions& /*mutable_db_options*/,
+ VersionStorageInfo* /*vstorage*/,
+ LogBuffer* /* log_buffer */) override {
return nullptr;
}
// files. Cannot be nullptr.
//
// @return true iff compaction was found.
-bool FindIntraL0Compaction(
- const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact,
- uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes,
- CompactionInputFiles* comp_inputs,
- SequenceNumber earliest_mem_seqno = kMaxSequenceNumber);
+bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
+ size_t min_files_to_compact,
+ uint64_t max_compact_bytes_per_del_file,
+ uint64_t max_compaction_bytes,
+ CompactionInputFiles* comp_inputs);
CompressionType GetCompressionType(const VersionStorageInfo* vstorage,
const MutableCFOptions& mutable_cf_options,
Compaction* FIFOCompactionPicker::PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
- LogBuffer* log_buffer, SequenceNumber /*earliest_memtable_seqno*/) {
+ LogBuffer* log_buffer) {
Compaction* c = nullptr;
if (mutable_cf_options.ttl > 0) {
c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options,
const InternalKeyComparator* icmp)
: CompactionPicker(ioptions, icmp) {}
- virtual Compaction* PickCompaction(
- const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
- const MutableDBOptions& mutable_db_options, VersionStorageInfo* version,
- LogBuffer* log_buffer,
- SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+ virtual Compaction* PickCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer) override;
virtual Compaction* CompactRange(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
public:
LevelCompactionBuilder(const std::string& cf_name,
VersionStorageInfo* vstorage,
- SequenceNumber earliest_mem_seqno,
CompactionPicker* compaction_picker,
LogBuffer* log_buffer,
const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options)
: cf_name_(cf_name),
vstorage_(vstorage),
- earliest_mem_seqno_(earliest_mem_seqno),
compaction_picker_(compaction_picker),
log_buffer_(log_buffer),
mutable_cf_options_(mutable_cf_options),
const std::string& cf_name_;
VersionStorageInfo* vstorage_;
- SequenceNumber earliest_mem_seqno_;
CompactionPicker* compaction_picker_;
LogBuffer* log_buffer_;
int start_level_ = -1;
}
output_level_ =
(start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
- if (PickFileToCompact()) {
+ bool picked_file_to_compact = PickFileToCompact();
+ TEST_SYNC_POINT_CALLBACK("PostPickFileToCompact",
+ &picked_file_to_compact);
+ if (picked_file_to_compact) {
// found the compaction!
if (start_level_ == 0) {
// L0 score = `num L0 files` / `level0_file_num_compaction_trigger`
return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
std::numeric_limits<uint64_t>::max(),
mutable_cf_options_.max_compaction_bytes,
- &start_level_inputs_, earliest_mem_seqno_);
+ &start_level_inputs_);
}
} // namespace
Compaction* LevelCompactionPicker::PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
- LogBuffer* log_buffer, SequenceNumber earliest_mem_seqno) {
- LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this,
- log_buffer, mutable_cf_options, ioptions_,
+ LogBuffer* log_buffer) {
+ LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer,
+ mutable_cf_options, ioptions_,
mutable_db_options);
return builder.PickCompaction();
}
LevelCompactionPicker(const ImmutableOptions& ioptions,
const InternalKeyComparator* icmp)
: CompactionPicker(ioptions, icmp) {}
- virtual Compaction* PickCompaction(
- const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
- const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
- LogBuffer* log_buffer,
- SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+ virtual Compaction* PickCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer) override;
virtual bool NeedsCompaction(
const VersionStorageInfo* vstorage) const override;
void NewVersionStorage(int num_levels, CompactionStyle style) {
DeleteVersionStorage();
options_.num_levels = num_levels;
- vstorage_.reset(new VersionStorageInfo(&icmp_, ucmp_, options_.num_levels,
- style, nullptr, false));
+ vstorage_.reset(new VersionStorageInfo(
+ &icmp_, ucmp_, options_.num_levels, style, nullptr, false,
+ EpochNumberRequirement::kMustPresent));
vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
}
void AddVersionStorage() {
temp_vstorage_.reset(new VersionStorageInfo(
&icmp_, ucmp_, options_.num_levels, ioptions_.compaction_style,
- vstorage_.get(), false));
+ vstorage_.get(), false, EpochNumberRequirement::kMustPresent));
}
void DeleteVersionStorage() {
size_t compensated_file_size = 0, bool marked_for_compact = false,
Temperature temperature = Temperature::kUnknown,
uint64_t oldest_ancestor_time = kUnknownOldestAncesterTime,
- Slice ts_of_smallest = Slice(), Slice ts_of_largest = Slice()) {
+ Slice ts_of_smallest = Slice(), Slice ts_of_largest = Slice(),
+ uint64_t epoch_number = kUnknownEpochNumber) {
assert(ts_of_smallest.size() == ucmp_->timestamp_size());
assert(ts_of_largest.size() == ucmp_->timestamp_size());
file_number, path_id, file_size, smallest_ikey, largest_ikey,
smallest_seq, largest_seq, marked_for_compact, temperature,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
- kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kNullUniqueId64x2);
f->compensated_file_size =
(compensated_file_size != 0) ? compensated_file_size : file_size;
ASSERT_EQ(0, compaction->output_level());
}
-TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) {
- // Intra L0 compaction triggers only if there are at least
- // level0_file_num_compaction_trigger + 2 L0 files.
- mutable_cf_options_.level0_file_num_compaction_trigger = 3;
- mutable_cf_options_.max_compaction_bytes = 999999u;
- NewVersionStorage(6, kCompactionStyleLevel);
-
- // 4 out of 6 L0 files will be picked for intra L0 compaction due to
- // being_compact limit. And the latest one L0 will be skipped due to earliest
- // seqno. The one L1 file spans entire L0 key range and is marked as being
- // compacted to avoid L0->L1 compaction.
- Add(1, 1U, "100", "350", 200000U, 0, 110, 111);
- Add(0, 2U, "301", "350", 1U, 0, 108, 109);
- Add(0, 3U, "251", "300", 1U, 0, 106, 107);
- Add(0, 4U, "201", "250", 1U, 0, 104, 105);
- Add(0, 5U, "151", "200", 1U, 0, 102, 103);
- Add(0, 6U, "100", "150", 1U, 0, 100, 101);
- Add(0, 7U, "100", "100", 1U, 0, 99, 100);
- vstorage_->LevelFiles(0)[5]->being_compacted = true;
- vstorage_->LevelFiles(1)[0]->being_compacted = true;
- UpdateVersionStorageInfo();
-
- std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
- cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
- &log_buffer_, 107));
- ASSERT_TRUE(compaction.get() != nullptr);
- ASSERT_EQ(1U, compaction->num_input_levels());
- ASSERT_EQ(4U, compaction->num_input_files(0));
- ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
- compaction->compaction_reason());
- ASSERT_EQ(0, compaction->output_level());
-}
#ifndef ROCKSDB_LITE
TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) {
// should fail
NewVersionStorage(5, kCompactionStyleUniversal);
- Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
- Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450);
- Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300);
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550, /*compensated_file_size*/ 0,
+ /*marked_for_compact*/ false, /* temperature*/ Temperature::kUnknown,
+ /*oldest_ancestor_time*/ kUnknownOldestAncesterTime,
+ /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(),
+ /*epoch_number*/ 3);
+ Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450,
+ /*compensated_file_size*/ 0, /*marked_for_compact*/ false,
+ /* temperature*/ Temperature::kUnknown,
+ /*oldest_ancestor_time*/ kUnknownOldestAncesterTime,
+ /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(),
+ /*epoch_number*/ 2);
+ Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300,
+ /*compensated_file_size*/ 0, /*marked_for_compact*/ false,
+ /* temperature*/ Temperature::kUnknown,
+ /*oldest_ancestor_time*/ kUnknownOldestAncesterTime,
+ /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(),
+ /*epoch_number*/ 1);
Add(3, 5U, "010", "080", 8 * kFileSize, 0, 200, 251);
Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150);
Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150);
AddVersionStorage();
// Simulate a flush and mark the file for compaction
- Add(0, 7U, "150", "200", kFileSize, 0, 551, 600, 0, true);
+ Add(0, 7U, "150", "200", kFileSize, 0, 551, 600, 0, true,
+ /* temperature*/ Temperature::kUnknown,
+ /*oldest_ancestor_time*/ kUnknownOldestAncesterTime,
+ /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(),
+ /*epoch_number*/ 4);
UpdateVersionStorageInfo();
std::unique_ptr<Compaction> compaction2(
NewVersionStorage(5, kCompactionStyleUniversal);
// Mark file number 4 for compaction
- Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true);
+ Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true,
+ /* temperature*/ Temperature::kUnknown,
+ /*oldest_ancestor_time*/ kUnknownOldestAncesterTime,
+ /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(),
+ /*epoch_number*/ 1);
Add(3, 5U, "240", "290", 8 * kFileSize, 0, 201, 250);
Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150);
Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150);
ASSERT_EQ(1U, compaction->num_input_files(1));
AddVersionStorage();
- Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
- Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450);
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550, /*compensated_file_size*/ 0,
+ /*marked_for_compact*/ false, /* temperature*/ Temperature::kUnknown,
+ /*oldest_ancestor_time*/ kUnknownOldestAncesterTime,
+ /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(),
+ /*epoch_number*/ 3);
+ Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450,
+ /*compensated_file_size*/ 0, /*marked_for_compact*/ false,
+ /* temperature*/ Temperature::kUnknown,
+ /*oldest_ancestor_time*/ kUnknownOldestAncesterTime,
+ /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(),
+ /*epoch_number*/ 2);
UpdateVersionStorageInfo();
std::unique_ptr<Compaction> compaction2(
NewVersionStorage(1, kCompactionStyleUniversal);
// Mark file number 5 for compaction
- Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300);
- Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true);
- Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
- Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300,
+ /*compensated_file_size*/ 0, /*marked_for_compact*/ false,
+ /* temperature*/ Temperature::kUnknown,
+ /*oldest_ancestor_time*/ kUnknownOldestAncesterTime,
+ /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(),
+ /*epoch_number*/ 4);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true,
+ /* temperature*/ Temperature::kUnknown,
+ /*oldest_ancestor_time*/ kUnknownOldestAncesterTime,
+ /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(),
+ /*epoch_number*/ 3);
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150,
+ /*compensated_file_size*/ 0, /*marked_for_compact*/ false,
+ /* temperature*/ Temperature::kUnknown,
+ /*oldest_ancestor_time*/ kUnknownOldestAncesterTime,
+ /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(),
+ /*epoch_number*/ 2);
+ Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100,
+ /*compensated_file_size*/ 0, /*marked_for_compact*/ false,
+ /* temperature*/ Temperature::kUnknown,
+ /*oldest_ancestor_time*/ kUnknownOldestAncesterTime,
+ /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(),
+ /*epoch_number*/ 1);
UpdateVersionStorageInfo();
std::unique_ptr<Compaction> compaction(
ASSERT_TRUE(file_map_[6].first->being_compacted);
AddVersionStorage();
- Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
- Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550, /*compensated_file_size*/ 0,
+ /*marked_for_compact*/ false,
+ /* temperature*/ Temperature::kUnknown,
+ /*oldest_ancestor_time*/ kUnknownOldestAncesterTime,
+ /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(),
+ /*epoch_number*/ 6);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450, /*compensated_file_size*/ 0,
+ /*marked_for_compact*/ false,
+ /* temperature*/ Temperature::kUnknown,
+ /*oldest_ancestor_time*/ kUnknownOldestAncesterTime,
+ /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(),
+ /*epoch_number*/ 5);
UpdateVersionStorageInfo();
std::unique_ptr<Compaction> compaction2(
Compaction* UniversalCompactionPicker::PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
- LogBuffer* log_buffer, SequenceNumber /* earliest_memtable_seqno */) {
+ LogBuffer* log_buffer) {
UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name,
mutable_cf_options, mutable_db_options,
vstorage, this, log_buffer);
if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
// Always need to do a full compaction for periodic compaction.
c = PickPeriodicCompaction();
+ TEST_SYNC_POINT_CALLBACK("PostPickPeriodicCompaction", c);
}
// Check for size amplification.
static_cast<size_t>(
mutable_cf_options_.level0_file_num_compaction_trigger)) {
if ((c = PickCompactionToReduceSizeAmp()) != nullptr) {
+ TEST_SYNC_POINT("PickCompactionToReduceSizeAmpReturnNonnullptr");
ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n",
cf_name_.c_str());
} else {
mutable_cf_options_.compaction_options_universal.size_ratio;
if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) {
+ TEST_SYNC_POINT("PickCompactionToReduceSortedRunsReturnNonnullptr");
ROCKS_LOG_BUFFER(log_buffer_,
"[%s] Universal: compacting for size ratio\n",
cf_name_.c_str());
if (c == nullptr) {
if ((c = PickDeleteTriggeredCompaction()) != nullptr) {
+ TEST_SYNC_POINT("PickDeleteTriggeredCompactionReturnNonnullptr");
ROCKS_LOG_BUFFER(log_buffer_,
"[%s] Universal: delete triggered compaction\n",
cf_name_.c_str());
UniversalCompactionPicker(const ImmutableOptions& ioptions,
const InternalKeyComparator* icmp)
: CompactionPicker(ioptions, icmp) {}
- virtual Compaction* PickCompaction(
- const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
- const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
- LogBuffer* log_buffer,
- SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+ virtual Compaction* PickCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer) override;
virtual int MaxOutputLevel() const override { return NumberLevels() - 1; }
virtual bool NeedsCompaction(
meta.largest.DecodeFrom(file.largest_internal_key);
meta.oldest_ancester_time = file.oldest_ancester_time;
meta.file_creation_time = file.file_creation_time;
+ meta.epoch_number = file.epoch_number;
meta.marked_for_compaction = file.marked_for_compaction;
meta.unique_id = file.unique_id;
MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno,
meta.fd.largest_seqno, meta.smallest.Encode().ToString(),
meta.largest.Encode().ToString(), meta.oldest_ancester_time,
- meta.file_creation_time, output_file.validator.GetHash(),
- meta.marked_for_compaction, meta.unique_id);
+ meta.file_creation_time, meta.epoch_number,
+ output_file.validator.GetHash(), meta.marked_for_compaction,
+ meta.unique_id);
}
InternalStats::CompactionStatsFull compaction_stats;
sub_compact->AggregateCompactionStats(compaction_stats);
{offsetof(struct CompactionServiceOutputFile, file_creation_time),
OptionType::kUInt64T, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
+ {"epoch_number",
+ {offsetof(struct CompactionServiceOutputFile, epoch_number),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
{"paranoid_hash",
{offsetof(struct CompactionServiceOutputFile, paranoid_hash),
OptionType::kUInt64T, OptionVerificationType::kNormal,
#include <tuple>
+#include "compaction/compaction_picker_universal.h"
#include "db/blob/blob_index.h"
#include "db/db_test_util.h"
+#include "db/dbformat.h"
#include "env/mock_env.h"
#include "port/port.h"
#include "port/stack_trace.h"
ASSERT_OK(db->IngestExternalFile({info.file_path}, ingest_opt));
}
-TEST_P(DBCompactionTestWithParam,
- FlushAfterIntraL0CompactionCheckConsistencyFail) {
- Options options = CurrentOptions();
- options.force_consistency_checks = true;
- options.compression = kNoCompression;
- options.level0_file_num_compaction_trigger = 5;
- options.max_background_compactions = 2;
- options.max_subcompactions = max_subcompactions_;
- DestroyAndReopen(options);
+class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest {
+ public:
+ DBCompactionTestL0FilesMisorderCorruption() : DBCompactionTest() {}
+ void SetupOptions(const CompactionStyle compaciton_style,
+ const std::string& compaction_path_to_test = "") {
+ options_ = CurrentOptions();
+ options_.create_if_missing = true;
+ options_.compression = kNoCompression;
+
+ options_.force_consistency_checks = true;
+ options_.compaction_style = compaciton_style;
+
+ if (compaciton_style == CompactionStyle::kCompactionStyleLevel) {
+ options_.num_levels = 7;
+ // Level compaction's PickIntraL0Compaction() impl detail requires
+ // `options.level0_file_num_compaction_trigger` to be
+ // at least 2 files less than the actual number of level 0 files
+ // (i.e, 7 by design in this test)
+ options_.level0_file_num_compaction_trigger = 5;
+ options_.max_background_compactions = 2;
+ options_.write_buffer_size = 2 << 20;
+ options_.max_write_buffer_number = 6;
+ } else if (compaciton_style == CompactionStyle::kCompactionStyleUniversal) {
+ // TODO: expand test coverage to num_levels > 1 for universal compaction,
+ // which requires careful unit test design to compact to level 0 despite
+ // num_levels > 1
+ options_.num_levels = 1;
+ options_.level0_file_num_compaction_trigger = 5;
+
+ CompactionOptionsUniversal universal_options;
+ if (compaction_path_to_test == "PickCompactionToReduceSizeAmp") {
+ universal_options.max_size_amplification_percent = 50;
+ } else if (compaction_path_to_test ==
+ "PickCompactionToReduceSortedRuns") {
+ universal_options.max_size_amplification_percent = 400;
+ } else if (compaction_path_to_test == "PickDeleteTriggeredCompaction") {
+ universal_options.max_size_amplification_percent = 400;
+ universal_options.min_merge_width = 6;
+ }
+ options_.compaction_options_universal = universal_options;
+ } else if (compaciton_style == CompactionStyle::kCompactionStyleFIFO) {
+ options_.max_open_files = -1;
+ options_.num_levels = 1;
+ options_.level0_file_num_compaction_trigger = 3;
+
+ CompactionOptionsFIFO fifo_options;
+ if (compaction_path_to_test == "FindIntraL0Compaction" ||
+ compaction_path_to_test == "CompactRange") {
+ fifo_options.allow_compaction = true;
+ fifo_options.age_for_warm = 0;
+ } else if (compaction_path_to_test == "CompactFile") {
+ fifo_options.allow_compaction = false;
+ fifo_options.age_for_warm = 0;
+ }
+ options_.compaction_options_fifo = fifo_options;
+ }
- const size_t kValueSize = 1 << 20;
- Random rnd(301);
- std::atomic<int> pick_intra_l0_count(0);
- std::string value(rnd.RandomString(kValueSize));
+ if (compaction_path_to_test == "CompactFile" ||
+ compaction_path_to_test == "CompactRange") {
+ options_.disable_auto_compactions = true;
+ } else {
+ options_.disable_auto_compactions = false;
+ }
+ }
- // The L0->L1 must be picked before we begin ingesting files to trigger
- // intra-L0 compaction, and must not finish until after an intra-L0
- // compaction has been picked.
- ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
- {{"LevelCompactionPicker::PickCompaction:Return",
- "DBCompactionTestWithParam::"
- "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"},
- {"LevelCompactionPicker::PickCompactionBySize:0",
- "CompactionJob::Run():Start"}});
- ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
- "FindIntraL0Compaction",
- [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+ // Releases the snapshot held by this fixture (if there is one) and then
+ // delegates to DBTestBase::Destroy() with the given options.
+ void Destroy(const Options& options) {
+   if (snapshot_ != nullptr) {
+     assert(db_ != nullptr);
+     const Snapshot* const held_snapshot = snapshot_;
+     snapshot_ = nullptr;
+     db_->ReleaseSnapshot(held_snapshot);
+   }
+   DBTestBase::Destroy(options);
+ }
- ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ // Reopens the DB through DBTestBase::Reopen(). For every compaction style
+ // other than level compaction, it then takes a snapshot and stores it in
+ // `snapshot_`.
+ void Reopen(const Options& options) {
+   DBTestBase::Reopen(options);
+   const bool is_level_style =
+       options.compaction_style == CompactionStyle::kCompactionStyleLevel;
+   if (is_level_style) {
+     return;
+   }
+   // To force assigning the global seqno to ingested file
+   // for our test purpose.
+   assert(snapshot_ == nullptr);
+   snapshot_ = db_->GetSnapshot();
+ }
- // prevents trivial move
- for (int i = 0; i < 10; ++i) {
- ASSERT_OK(Put(Key(i), "")); // prevents trivial move
+ // Runs this fixture's Destroy() followed by its Reopen() with the same
+ // options.
+ void DestroyAndReopen(Options& options) {
+   this->Destroy(options);
+   this->Reopen(options);
 }
- ASSERT_OK(Flush());
- Compact("", Key(99));
- ASSERT_EQ(0, NumTableFilesAtLevel(0));
- // Flush 5 L0 sst.
- for (int i = 0; i < 5; ++i) {
- ASSERT_OK(Put(Key(i + 1), value));
- ASSERT_OK(Flush());
+ // Creates a fresh sleeping task, limits the LOW-priority pool to a single
+ // background thread, schedules the task on that pool and returns once the
+ // task reports that it is sleeping.
+ void PauseCompactionThread() {
+   sleeping_task_.reset(new test::SleepingBackgroundTask());
+   test::SleepingBackgroundTask* const blocker = sleeping_task_.get();
+   env_->SetBackgroundThreads(1, Env::LOW);
+   env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, blocker,
+                  Env::Priority::LOW);
+   blocker->WaitUntilSleeping();
 }
- ASSERT_EQ(5, NumTableFilesAtLevel(0));
- // Put one key, to make smallest log sequence number in this memtable is less
- // than sst which would be ingested in next step.
- ASSERT_OK(Put(Key(0), "a"));
+ // Wakes up the sleeping task created by PauseCompactionThread() and waits
+ // until it is done. Does nothing when no sleeping task has been created.
+ void ResumeCompactionThread() {
+   if (!sleeping_task_) {
+     return;
+   }
+   sleeping_task_->WakeUp();
+   sleeping_task_->WaitUntilDone();
+ }
- ASSERT_EQ(5, NumTableFilesAtLevel(0));
- TEST_SYNC_POINT(
- "DBCompactionTestWithParam::"
- "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready");
+ void AddFilesMarkedForPeriodicCompaction(const size_t num_files) {
+ assert(options_.compaction_style ==
+ CompactionStyle::kCompactionStyleUniversal);
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+ Version* const current = cfd->current();
+ assert(current);
- // Ingest 5 L0 sst. And this files would trigger PickIntraL0Compaction.
- for (int i = 5; i < 10; i++) {
- ASSERT_EQ(i, NumTableFilesAtLevel(0));
- IngestOneKeyValue(dbfull(), Key(i), value, options);
+ VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
+
+ const std::vector<FileMetaData*> level0_files = storage_info->LevelFiles(0);
+ assert(level0_files.size() == num_files);
+
+ for (FileMetaData* f : level0_files) {
+ storage_info->TEST_AddFileMarkedForPeriodicCompaction(0, f);
+ }
}
- // Put one key, to make biggest log sequence number in this memtable is bigger
- // than sst which would be ingested in next step.
- ASSERT_OK(Put(Key(2), "b"));
- ASSERT_OK(dbfull()->TEST_WaitForCompact());
- ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
- std::vector<std::vector<FileMetaData>> level_to_files;
- dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
- &level_to_files);
- ASSERT_GT(level_to_files[0].size(), 0);
- ASSERT_GT(pick_intra_l0_count.load(), 0);
+ void AddFilesMarkedForCompaction(const size_t num_files) {
+ assert(options_.compaction_style ==
+ CompactionStyle::kCompactionStyleUniversal);
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+ Version* const current = cfd->current();
+ assert(current);
- ASSERT_OK(Flush());
-}
+ VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
-TEST_P(DBCompactionTestWithParam,
- IntraL0CompactionAfterFlushCheckConsistencyFail) {
- Options options = CurrentOptions();
- options.force_consistency_checks = true;
- options.compression = kNoCompression;
- options.level0_file_num_compaction_trigger = 5;
- options.max_background_compactions = 2;
- options.max_subcompactions = max_subcompactions_;
- options.write_buffer_size = 2 << 20;
- options.max_write_buffer_number = 6;
- DestroyAndReopen(options);
+ const std::vector<FileMetaData*> level0_files = storage_info->LevelFiles(0);
+ assert(level0_files.size() == num_files);
- const size_t kValueSize = 1 << 20;
- Random rnd(301);
- std::string value(rnd.RandomString(kValueSize));
- std::string value2(rnd.RandomString(kValueSize));
- std::string bigvalue = value + value;
+ for (FileMetaData* f : level0_files) {
+ storage_info->TEST_AddFileMarkedForCompaction(0, f);
+ }
+ }
- // prevents trivial move
+ // Resets the "compaction path taken" flag, registers the sync point
+ // callbacks that detect the compaction path named by
+ // `compaction_path_to_test`, and enables sync point processing.
+ // A path name that matches none of the branches below registers no
+ // callback. The outcome is reported by SyncPointsCalled().
+ void SetupSyncPoints(const std::string& compaction_path_to_test) {
+   compaction_path_sync_point_called_.store(false);
+
+   auto* const sync_point = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
+   // Shared by every branch that only needs to record that its sync point
+   // was reached.
+   const auto mark_path_taken = [this](void* /*arg*/) {
+     compaction_path_sync_point_called_.store(true);
+   };
+
+   if (compaction_path_to_test == "FindIntraL0Compaction" &&
+       options_.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+     sync_point->SetCallBack("PostPickFileToCompact", [](void* arg) {
+       // To trigger intra-L0 compaction specifically,
+       // we mock PickFileToCompact()'s result to be false
+       *static_cast<bool*>(arg) = false;
+     });
+     sync_point->SetCallBack("FindIntraL0Compaction", mark_path_taken);
+   } else if (compaction_path_to_test == "PickPeriodicCompaction") {
+     assert(options_.compaction_style ==
+            CompactionStyle::kCompactionStyleUniversal);
+     sync_point->SetCallBack(
+         "PostPickPeriodicCompaction", [this](void* compaction_arg) {
+           // The argument is treated as a Compaction*; a null pointer does
+           // not count as the path having been taken.
+           if (static_cast<Compaction*>(compaction_arg) != nullptr) {
+             compaction_path_sync_point_called_.store(true);
+           }
+         });
+   } else if (compaction_path_to_test == "PickCompactionToReduceSizeAmp") {
+     assert(options_.compaction_style ==
+            CompactionStyle::kCompactionStyleUniversal);
+     sync_point->SetCallBack("PickCompactionToReduceSizeAmpReturnNonnullptr",
+                             mark_path_taken);
+   } else if (compaction_path_to_test == "PickCompactionToReduceSortedRuns") {
+     assert(options_.compaction_style ==
+            CompactionStyle::kCompactionStyleUniversal);
+     sync_point->SetCallBack(
+         "PickCompactionToReduceSortedRunsReturnNonnullptr", mark_path_taken);
+   } else if (compaction_path_to_test == "PickDeleteTriggeredCompaction") {
+     assert(options_.compaction_style ==
+            CompactionStyle::kCompactionStyleUniversal);
+     sync_point->SetCallBack("PickDeleteTriggeredCompactionReturnNonnullptr",
+                             mark_path_taken);
+   } else if ((compaction_path_to_test == "FindIntraL0Compaction" ||
+               compaction_path_to_test == "CompactRange") &&
+              options_.compaction_style ==
+                  CompactionStyle::kCompactionStyleFIFO) {
+     sync_point->SetCallBack("FindIntraL0Compaction", mark_path_taken);
+   }
+
+   sync_point->EnableProcessing();
+ }
+
+ // Returns the current value of the flag that SetupSyncPoints() resets and
+ // that its registered callbacks set.
+ bool SyncPointsCalled() {
+   const bool path_taken = compaction_path_sync_point_called_.load();
+   return path_taken;
+ }
+
+ // Clears every registered sync point callback, then disables sync point
+ // processing.
+ void DisableSyncPoints() {
+   auto* const sync_point = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
+   sync_point->ClearAllCallBacks();
+   sync_point->DisableProcessing();
+ }
+
+ // Return the largest seqno of the latest L0 file of the default column
+ // family, where "latest" means the file with the greatest file number.
+ // Returns 0 if no L0 file has a file number greater than 0.
+ SequenceNumber GetLatestL0FileLargestSeqnoHelper() {
+   VersionSet* const versions = dbfull()->GetVersionSet();
+   assert(versions);
+   ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+   assert(cfd);
+   Version* const current = cfd->current();
+   assert(current);
+   VersionStorageInfo* const storage_info = current->storage_info();
+   assert(storage_info);
+   // Bind by reference: the file list is only read here, so copying the
+   // vector is unnecessary.
+   const auto& level0_files = storage_info->LevelFiles(0);
+   assert(!level0_files.empty());
+
+   uint64_t latest_file_num = 0;
+   // Same type as the return value rather than a bare uint64_t.
+   SequenceNumber latest_file_largest_seqno = 0;
+   for (const FileMetaData* f : level0_files) {
+     if (f->fd.GetNumber() > latest_file_num) {
+       latest_file_num = f->fd.GetNumber();
+       latest_file_largest_seqno = f->fd.largest_seqno;
+     }
+   }
+
+   return latest_file_largest_seqno;
+ }
+
+ protected:
+ Options options_;
+
+ private:
+ const Snapshot* snapshot_ = nullptr;
+ // Explicitly initialized: before C++20 a default-constructed
+ // std::atomic<bool> holds an indeterminate value, and SyncPointsCalled()
+ // reads this flag without requiring a prior SetupSyncPoints() call.
+ std::atomic<bool> compaction_path_sync_point_called_{false};
+ std::shared_ptr<test::SleepingBackgroundTask> sleeping_task_;
+};
+
+TEST_F(DBCompactionTestL0FilesMisorderCorruption,
+ FlushAfterIntraL0LevelCompactionWithIngestedFile) {
+ SetupOptions(CompactionStyle::kCompactionStyleLevel, "");
+ DestroyAndReopen(options_);
+ // Prevents trivial move
for (int i = 0; i < 10; ++i) {
- ASSERT_OK(Put(Key(i), "")); // prevents trivial move
+ ASSERT_OK(Put(Key(i), "")); // Prevents trivial move
}
ASSERT_OK(Flush());
Compact("", Key(99));
ASSERT_EQ(0, NumTableFilesAtLevel(0));
- std::atomic<int> pick_intra_l0_count(0);
- // The L0->L1 must be picked before we begin ingesting files to trigger
- // intra-L0 compaction, and must not finish until after an intra-L0
- // compaction has been picked.
- ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
- {{"LevelCompactionPicker::PickCompaction:Return",
- "DBCompactionTestWithParam::"
- "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"},
- {"LevelCompactionPicker::PickCompactionBySize:0",
- "CompactionJob::Run():Start"}});
- ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
- "FindIntraL0Compaction",
- [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
- ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
- // Make 6 L0 sst.
+ // To get accurate NumTableFilesAtLevel(0) when the number reaches
+ // options_.level0_file_num_compaction_trigger
+ PauseCompactionThread();
+
+ // To create below LSM tree
+ // (key:value@n indicates key-value pair has seqno "n", L0 is sorted):
+ //
+ // memtable: m1[ 5:new@12 .. 1:new@8, 0:new@7]
+ // L0: s6[6:new@13], s5[5:old@6] ... s1[1:old@2],s0[0:old@1]
+ //
+ // (1) Make 6 L0 sst (i.e, s0 - s5)
for (int i = 0; i < 6; ++i) {
if (i % 2 == 0) {
- IngestOneKeyValue(dbfull(), Key(i), value, options);
+ IngestOneKeyValue(dbfull(), Key(i), "old", options_);
} else {
- ASSERT_OK(Put(Key(i), value));
+ ASSERT_OK(Put(Key(i), "old"));
ASSERT_OK(Flush());
}
}
-
ASSERT_EQ(6, NumTableFilesAtLevel(0));
- // Stop run flush job
- env_->SetBackgroundThreads(1, Env::HIGH);
- test::SleepingBackgroundTask sleeping_tasks;
- env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_tasks,
- Env::Priority::HIGH);
- sleeping_tasks.WaitUntilSleeping();
-
- // Put many keys to make memtable request to flush
+ // (2) Create m1
for (int i = 0; i < 6; ++i) {
- ASSERT_OK(Put(Key(i), bigvalue));
+ ASSERT_OK(Put(Key(i), "new"));
}
-
ASSERT_EQ(6, NumTableFilesAtLevel(0));
- TEST_SYNC_POINT(
- "DBCompactionTestWithParam::"
- "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready");
- // ingest file to trigger IntraL0Compaction
- for (int i = 6; i < 10; ++i) {
+
+ // (3) Ingest file (i.e, s6) to trigger IntraL0Compaction()
+ for (int i = 6; i < 7; ++i) {
ASSERT_EQ(i, NumTableFilesAtLevel(0));
- IngestOneKeyValue(dbfull(), Key(i), value2, options);
+ IngestOneKeyValue(dbfull(), Key(i), "new", options_);
}
- // Wake up flush job
- sleeping_tasks.WakeUp();
- sleeping_tasks.WaitUntilDone();
+ SetupSyncPoints("FindIntraL0Compaction");
+ ResumeCompactionThread();
+
ASSERT_OK(dbfull()->TEST_WaitForCompact());
- ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
- uint64_t error_count = 0;
- db_->GetIntProperty("rocksdb.background-errors", &error_count);
- ASSERT_EQ(error_count, 0);
- ASSERT_GT(pick_intra_l0_count.load(), 0);
+ ASSERT_TRUE(SyncPointsCalled());
+ DisableSyncPoints();
+
+ // After compaction, we have LSM tree:
+ //
+ // memtable: m1[ 5:new@12 .. 1:new@8, 0:new@7]
+ // L0: s7[6:new@13, 5:old@6 .. 0:old@1]
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ SequenceNumber compact_output_file_largest_seqno =
+ GetLatestL0FileLargestSeqnoHelper();
+
+ ASSERT_OK(Flush());
+ // After flush, we have LSM tree:
+ //
+ // L0: s8[5:new@12 .. 0:new@7],s7[6:new@13, 5:old@6 .. 0:old@1]
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ SequenceNumber flushed_file_largest_seqno =
+ GetLatestL0FileLargestSeqnoHelper();
+
+ // To verify there isn't any file misorder leading to returning an old value
+ // of Key(0) - Key(5). Such a misorder would be caused by the flushed table
+ // s8 having a smaller largest seqno than the compaction output file s7
+ // while the flushed table holds newer versions of the values than the
+ // compaction output file does.
+ ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno);
for (int i = 0; i < 6; ++i) {
- ASSERT_EQ(bigvalue, Get(Key(i)));
+ ASSERT_EQ("new", Get(Key(i)));
+ }
+ for (int i = 6; i < 7; ++i) {
+ ASSERT_EQ("new", Get(Key(i)));
+ }
+}
+
+TEST_F(DBCompactionTestL0FilesMisorderCorruption,
+ FlushAfterIntraL0UniversalCompactionWithIngestedFile) {
+ for (const std::string compaction_path_to_test :
+ {"PickPeriodicCompaction", "PickCompactionToReduceSizeAmp",
+ "PickCompactionToReduceSortedRuns", "PickDeleteTriggeredCompaction"}) {
+ SetupOptions(CompactionStyle::kCompactionStyleUniversal,
+ compaction_path_to_test);
+ DestroyAndReopen(options_);
+
+ // To get accurate NumTableFilesAtLevel(0) when the number reaches
+ // options_.level0_file_num_compaction_trigger
+ PauseCompactionThread();
+
+ // To create below LSM tree
+ // (key:value@n indicates key-value pair has seqno "n", L0 is sorted):
+ //
+ // memtable: m1 [ k2:new@8, k1:new@7]
+ // L0: s4[k9:dummy@10], s3[k8:dummy@9],
+ // s2[k7:old@6, k6:old@5].. s0[k3:old@2, k1:old@1]
+ //
+ // (1) Create 3 existing SST file (i.e, s0 - s2)
+ ASSERT_OK(Put("k1", "old"));
+ ASSERT_OK(Put("k3", "old"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_OK(Put("k4", "old"));
+ ASSERT_OK(Put("k5", "old"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_OK(Put("k6", "old"));
+ ASSERT_OK(Put("k7", "old"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(3, NumTableFilesAtLevel(0));
+
+ // (2) Create m1. Note that it contains a key that overlaps with s0
+ ASSERT_OK(Put("k1", "new")); // overlapped key
+ ASSERT_OK(Put("k2", "new"));
+
+ // (3) Ingest two SST files s3, s4
+ IngestOneKeyValue(dbfull(), "k8", "dummy", options_);
+ IngestOneKeyValue(dbfull(), "k9", "dummy", options_);
+ // Up to now, L0 contains s0 - s4
+ ASSERT_EQ(5, NumTableFilesAtLevel(0));
+
+ if (compaction_path_to_test == "PickPeriodicCompaction") {
+ AddFilesMarkedForPeriodicCompaction(5);
+ } else if (compaction_path_to_test == "PickDeleteTriggeredCompaction") {
+ AddFilesMarkedForCompaction(5);
+ }
+
+ SetupSyncPoints(compaction_path_to_test);
+ ResumeCompactionThread();
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_TRUE(SyncPointsCalled())
+ << "failed for compaction path to test: " << compaction_path_to_test;
+ DisableSyncPoints();
+
+ // After compaction, we have LSM tree:
+ //
+ // memtable: m1[ k2:new@8, k1:new@7]
+ // L0: s5[k9:dummy@10, k8:dummy@9, k7:old@6 .. k3:old@2, k1:old@1]
+ ASSERT_EQ(1, NumTableFilesAtLevel(0))
+ << "failed for compaction path to test: " << compaction_path_to_test;
+ SequenceNumber compact_output_file_largest_seqno =
+ GetLatestL0FileLargestSeqnoHelper();
+
+ ASSERT_OK(Flush()) << "failed for compaction path to test: "
+ << compaction_path_to_test;
+ // After flush, we have LSM tree:
+ //
+ // L0: s6[k2:new@8, k1:new@7],
+ // s5[k9:dummy@10, k8:dummy@9, k7:old@6 .. k3:old@2, k1:old@1]
+ ASSERT_EQ(2, NumTableFilesAtLevel(0))
+ << "failed for compaction path to test: " << compaction_path_to_test;
+ SequenceNumber flushed_file_largest_seqno =
+ GetLatestL0FileLargestSeqnoHelper();
+
+ // To verify there isn't any file misorder leading to returning a old
+ // value of "k1" , which is caused by flushed table s6 has a
+ // smaller largest seqno than the compaction output file s5's largest seqno
+ // while the flushed table has the newer version of the value
+ // than the compaction output file's.
+ ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno)
+ << "failed for compaction path to test: " << compaction_path_to_test;
+ EXPECT_EQ(Get("k1"), "new")
+ << "failed for compaction path to test: " << compaction_path_to_test;
+ }
+
+ Destroy(options_);
+}
+
+TEST_F(DBCompactionTestL0FilesMisorderCorruption,
+ FlushAfterIntraL0FIFOCompactionWithIngestedFile) {
+ for (const std::string compaction_path_to_test : {"FindIntraL0Compaction"}) {
+ SetupOptions(CompactionStyle::kCompactionStyleFIFO,
+ compaction_path_to_test);
+ DestroyAndReopen(options_);
+
+ // To create below LSM tree
+ // (key:value@n indicates key-value pair has seqno "n", L0 is sorted):
+ //
+ // memtable: m1 [ k2:new@4, k1:new@3]
+ // L0: s2[k5:dummy@6], s1[k4:dummy@5], s0[k3:old@2, k1:old@1]
+ //
+ // (1) Create an existing SST file s0
+ ASSERT_OK(Put("k1", "old"));
+ ASSERT_OK(Put("k3", "old"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // (2) Create memtable m1. Note that it contains a key that overlaps with s0
+ ASSERT_OK(Put("k1", "new")); // overlapped key
+ ASSERT_OK(Put("k2", "new"));
+
+ // To get accurate NumTableFilesAtLevel(0) when the number reaches
+ // options_.level0_file_num_compaction_trigger
+ PauseCompactionThread();
+
+ // (3) Ingest two SST files s1, s2
+ IngestOneKeyValue(dbfull(), "k4", "dummy", options_);
+ IngestOneKeyValue(dbfull(), "k5", "dummy", options_);
+ // Up to now, L0 contains s0, s1, s2
+ ASSERT_EQ(3, NumTableFilesAtLevel(0));
+
+ SetupSyncPoints(compaction_path_to_test);
+ ResumeCompactionThread();
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_TRUE(SyncPointsCalled())
+ << "failed for compaction path to test: " << compaction_path_to_test;
+ DisableSyncPoints();
+ // After compaction, we have LSM tree:
+ //
+ // memtable: m1 [ k2:new@4, k1:new@3]
+ // L0: s3[k5:dummy@6, k4:dummy@5, k3:old@2, k1:old@1]
+ ASSERT_EQ(1, NumTableFilesAtLevel(0))
+ << "failed for compaction path to test: " << compaction_path_to_test;
+ SequenceNumber compact_output_file_largest_seqno =
+ GetLatestL0FileLargestSeqnoHelper();
+
+ ASSERT_OK(Flush()) << "failed for compaction path to test: "
+ << compaction_path_to_test;
+ // After flush, we have LSM tree:
+ //
+ // L0: s4[k2:new@4, k1:new@3], s3[k5:dummy@6, k4:dummy@5, k3:old@2,
+ // k1:old@1]
+ ASSERT_EQ(2, NumTableFilesAtLevel(0))
+ << "failed for compaction path to test: " << compaction_path_to_test;
+ SequenceNumber flushed_file_largest_seqno =
+ GetLatestL0FileLargestSeqnoHelper();
+
+ // To verify there isn't any file misorder leading to returning a old
+ // value of "k1" , which is caused by flushed table s4 has a
+ // smaller largest seqno than the compaction output file s3's largest seqno
+ // while the flushed table has the newer version of the value
+ // than the compaction output file's.
+ ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno)
+ << "failed for compaction path to test: " << compaction_path_to_test;
+ EXPECT_EQ(Get("k1"), "new")
+ << "failed for compaction path to test: " << compaction_path_to_test;
+ }
+
+ Destroy(options_);
+}
+
+// Value-parameterized flavor of DBCompactionTestL0FilesMisorderCorruption;
+// the test parameter is a CompactionStyle.
+class DBCompactionTestL0FilesMisorderCorruptionWithParam
+    : public DBCompactionTestL0FilesMisorderCorruption,
+      public testing::WithParamInterface<CompactionStyle> {
+ public:
+  DBCompactionTestL0FilesMisorderCorruptionWithParam() = default;
+};
+
+// TODO: add `CompactionStyle::kCompactionStyleLevel` to testing parameter,
+// which requires careful unit test
+// design for ingesting file to L0 and CompactRange()/CompactFile() to L0
+INSTANTIATE_TEST_CASE_P(
+ DBCompactionTestL0FilesMisorderCorruptionWithParam,
+ DBCompactionTestL0FilesMisorderCorruptionWithParam,
+ ::testing::Values(CompactionStyle::kCompactionStyleUniversal,
+ CompactionStyle::kCompactionStyleFIFO));
+
+TEST_P(DBCompactionTestL0FilesMisorderCorruptionWithParam,
+ FlushAfterIntraL0CompactFileWithIngestedFile) {
+ SetupOptions(GetParam(), "CompactFile");
+ DestroyAndReopen(options_);
+
+ // To create below LSM tree
+ // (key:value@n indicates key-value pair has seqno "n", L0 is sorted):
+ //
+ // memtable: m1 [ k2:new@4, k1:new@3]
+ // L0: s2[k5:dummy@6], s1[k4:dummy@5], s0[k3:old@2, k1:old@1]
+ //
+ // (1) Create an existing SST file s0
+ ASSERT_OK(Put("k1", "old"));
+ ASSERT_OK(Put("k3", "old"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // (2) Create memtable m1. Note that it contains a key that overlaps with s0
+ ASSERT_OK(Put("k1", "new")); // overlapped key
+ ASSERT_OK(Put("k2", "new"));
+
+ // (3) Ingest two SST files s1, s2
+ IngestOneKeyValue(dbfull(), "k4", "dummy", options_);
+ IngestOneKeyValue(dbfull(), "k5", "dummy", options_);
+ // Up to now, L0 contains s0, s1, s2
+ ASSERT_EQ(3, NumTableFilesAtLevel(0));
+
+ ColumnFamilyMetaData cf_meta_data;
+ db_->GetColumnFamilyMetaData(&cf_meta_data);
+ ASSERT_EQ(cf_meta_data.levels[0].files.size(), 3);
+ std::vector<std::string> input_files;
+ for (const auto& file : cf_meta_data.levels[0].files) {
+ input_files.push_back(file.name);
}
- for (int i = 6; i < 10; ++i) {
- ASSERT_EQ(value2, Get(Key(i)));
+ ASSERT_EQ(input_files.size(), 3);
+
+ Status s = db_->CompactFiles(CompactionOptions(), input_files, 0);
+ // After compaction, we have LSM tree:
+ //
+ // memtable: m1 [ k2:new@4, k1:new@3]
+ // L0: s3[k5:dummy@6, k4:dummy@5, k3:old@2, k1:old@1]
+ ASSERT_OK(s);
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ SequenceNumber compact_output_file_largest_seqno =
+ GetLatestL0FileLargestSeqnoHelper();
+
+ ASSERT_OK(Flush());
+ // After flush, we have LSM tree:
+ //
+ // L0: s4[k2:new@4, k1:new@3], s3[k5:dummy@6, k4:dummy@5, k3:old@2,
+ // k1:old@1]
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ SequenceNumber flushed_file_largest_seqno =
+ GetLatestL0FileLargestSeqnoHelper();
+
+ // To verify there isn't any file misorder leading to returning an old value
+ // of "k1". Such a misorder would be caused by the flushed table s4 having a
+ // smaller largest seqno than the compaction output file s3 while the
+ // flushed table holds the newer version of the value than the
+ // compaction output file does.
+ ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno);
+ EXPECT_EQ(Get("k1"), "new");
+
+ Destroy(options_);
+}
+
+TEST_P(DBCompactionTestL0FilesMisorderCorruptionWithParam,
+ FlushAfterIntraL0CompactRangeWithIngestedFile) {
+ SetupOptions(GetParam(), "CompactRange");
+ DestroyAndReopen(options_);
+
+ // To create below LSM tree
+ // (key:value@n indicates key-value pair has seqno "n", L0 is sorted):
+ //
+ // memtable: m1 [ k2:new@4, k1:new@3]
+ // L0: s2[k5:dummy@6], s1[k4:dummy@5], s0[k3:old@2, k1:old@1]
+ //
+ // (1) Create an existing SST file s0
+ ASSERT_OK(Put("k1", "old"));
+ ASSERT_OK(Put("k3", "old"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // (2) Create memtable m1. Note that it contains a key that overlaps with s0
+ ASSERT_OK(Put("k1", "new")); // overlapped key
+ ASSERT_OK(Put("k2", "new"));
+
+ // (3) Ingest two SST files s1, s2
+ IngestOneKeyValue(dbfull(), "k4", "dummy", options_);
+ IngestOneKeyValue(dbfull(), "k5", "dummy", options_);
+ // Up to now, L0 contains s0, s1, s2
+ ASSERT_EQ(3, NumTableFilesAtLevel(0));
+
+ if (options_.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+ SetupSyncPoints("CompactRange");
+ }
+ // `start` and `end` are carefully chosen so that compact range:
+ // (1) doesn't overlap with memtable therefore the memtable won't be flushed
+ // (2) should target at compacting s0 with s1 and s2
+ Slice start("k3"), end("k5");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
+ // After compaction, we have LSM tree:
+ //
+ // memtable: m1 [ k2:new@4, k1:new@3]
+ // L0: s3[k5:dummy@6, k4:dummy@5, k3:old@2, k1:old@1]
+ if (options_.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+ ASSERT_TRUE(SyncPointsCalled());
+ DisableSyncPoints();
}
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ SequenceNumber compact_output_file_largest_seqno =
+ GetLatestL0FileLargestSeqnoHelper();
+
+ ASSERT_OK(Flush());
+ // After flush, we have LSM tree:
+ //
+ // L0: s4[k2:new@4, k1:new@3], s3[k5:dummy@6, k4:dummy@5, k3:old@2,
+ // k1:old@1]
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ SequenceNumber flushed_file_largest_seqno =
+ GetLatestL0FileLargestSeqnoHelper();
+
+ // To verify there isn't any file misorder leading to returning a old value
+ // of "k1" , which is caused by flushed table s4 has a smaller
+ // largest seqno than the compaction output file s3's largest seqno while the
+ // flushed table has the newer version of the value than the
+ // compaction output file's.
+ ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno);
+ EXPECT_EQ(Get("k1"), "new");
+
+ Destroy(options_);
}
TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) {
// Run ingestion jobs.
if (status.ok()) {
for (size_t i = 0; i != num_cfs; ++i) {
+ mutex_.AssertHeld();
status = ingestion_jobs[i].Run();
if (!status.ok()) {
break;
num_running_ingest_file_++;
assert(!cfd->IsDropped());
+ mutex_.AssertHeld();
status = import_job.Run();
// Install job edit [Mutex will be unlocked here]
VersionEdit edit;
edit.SetColumnFamily(cfd->GetID());
+
for (const auto& f : vstorage->LevelFiles(level)) {
edit.DeleteFile(level, f->fd.GetNumber());
edit.AddFile(
to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(),
f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno,
f->marked_for_compaction, f->temperature, f->oldest_blob_file_number,
- f->oldest_ancester_time, f->file_creation_time, f->file_checksum,
- f->file_checksum_func_name, f->unique_id);
+ f->oldest_ancester_time, f->file_creation_time, f->epoch_number,
+ f->file_checksum, f->file_checksum_func_name, f->unique_id);
}
ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
"[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno,
f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
f->oldest_blob_file_number, f->oldest_ancester_time,
- f->file_creation_time, f->file_checksum, f->file_checksum_func_name,
- f->unique_id);
+ f->file_creation_time, f->epoch_number, f->file_checksum,
+ f->file_checksum_func_name, f->unique_id);
ROCKS_LOG_BUFFER(
log_buffer,
f->fd.smallest_seqno, f->fd.largest_seqno,
f->marked_for_compaction, f->temperature,
f->oldest_blob_file_number, f->oldest_ancester_time,
- f->file_creation_time, f->file_checksum,
+ f->file_creation_time, f->epoch_number, f->file_checksum,
f->file_checksum_func_name, f->unique_id);
}
.PermitUncheckedError(); // ignore error
const uint64_t current_time = static_cast<uint64_t>(_current_time);
meta.oldest_ancester_time = current_time;
-
+ meta.epoch_number = cfd->NewEpochNumber();
{
auto write_hint = cfd->CalculateSSTWriteHint(0);
mutex_.Unlock();
constexpr int level = 0;
if (s.ok() && has_output) {
- edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
- meta.fd.GetFileSize(), meta.smallest, meta.largest,
- meta.fd.smallest_seqno, meta.fd.largest_seqno,
- meta.marked_for_compaction, meta.temperature,
- meta.oldest_blob_file_number, meta.oldest_ancester_time,
- meta.file_creation_time, meta.file_checksum,
- meta.file_checksum_func_name, meta.unique_id);
+ edit->AddFile(
+ level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(),
+ meta.smallest, meta.largest, meta.fd.smallest_seqno,
+ meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature,
+ meta.oldest_blob_file_number, meta.oldest_ancester_time,
+ meta.file_creation_time, meta.epoch_number, meta.file_checksum,
+ meta.file_checksum_func_name, meta.unique_id);
for (const auto& blob : blob_file_additions) {
edit->AddBlobFile(blob);
file_meta_from_files.file_creation_time);
ASSERT_GE(file_meta_from_cf.file_creation_time, start_time);
ASSERT_LE(file_meta_from_cf.file_creation_time, end_time);
+ ASSERT_EQ(file_meta_from_cf.epoch_number,
+ file_meta_from_files.epoch_number);
ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time);
ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time);
// More from FileStorageInfo
ASSERT_EQ(meta.largestkey, expected_meta.largest.user_key().ToString());
ASSERT_EQ(meta.oldest_blob_file_number,
expected_meta.oldest_blob_file_number);
+ ASSERT_EQ(meta.epoch_number, expected_meta.epoch_number);
// More from FileStorageInfo
ASSERT_EQ(meta.file_type, kTableFile);
#include "db/db_test_util.h"
#include "db/read_callback.h"
+#include "db/version_edit.h"
#include "options/options_helper.h"
#include "port/port.h"
#include "port/stack_trace.h"
class DBTest2 : public DBTestBase {
public:
DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {}
+ // Returns the FileMetaData pointers of the files at `level` in the current
+ // Version of the column family looked up with `cf` (default 0).
+ std::vector<FileMetaData*> GetLevelFileMetadatas(int level, int cf = 0) {
+   VersionSet* const version_set = dbfull()->GetVersionSet();
+   assert(version_set);
+   auto* const cf_set = version_set->GetColumnFamilySet();
+   ColumnFamilyData* const cf_data = cf_set->GetColumnFamily(cf);
+   assert(cf_data);
+   Version* const head = cf_data->current();
+   assert(head);
+   VersionStorageInfo* const vstorage = head->storage_info();
+   assert(vstorage);
+   const auto& files_at_level = vstorage->LevelFiles(level);
+   return files_at_level;
+ }
};
#ifndef ROCKSDB_LITE
ReopenWithColumnFamilies({"default", "test1", "test2"}, options);
}
+#ifndef ROCKSDB_LITE
+// Checks that L0 files are ordered newest-first by epoch_number rather than by
+// largest_seqno, and that a compaction output is assigned the minimum
+// epoch_number among its input files.
+TEST_F(DBTest2, SortL0FilesByEpochNumber) {
+  Options opts = CurrentOptions();
+  opts.compaction_style = kCompactionStyleUniversal;
+  opts.num_levels = 1;
+  DestroyAndReopen(opts);
+
+  // Written to the DB before the external files are created
+  ASSERT_OK(Put("key1", "seq1"));
+
+  // Two external SST files holding one key each
+  const std::string ingest_path1 = dbname_ + "/test_files1.sst";
+  const std::string ingest_path2 = dbname_ + "/test_files2.sst";
+  SstFileWriter writer{EnvOptions(), opts};
+  ASSERT_OK(writer.Open(ingest_path1));
+  ASSERT_OK(writer.Put("key2", "seq0"));
+  ASSERT_OK(writer.Finish());
+  ASSERT_OK(writer.Open(ingest_path2));
+  ASSERT_OK(writer.Put("key3", "seq0"));
+  ASSERT_OK(writer.Finish());
+
+  // key1 and key4 are flushed before the external files are ingested
+  ASSERT_OK(Put("key4", "seq2"));
+  ASSERT_OK(Flush());
+
+  const std::vector<std::string> to_ingest = {ingest_path1, ingest_path2};
+  auto* default_cf = db_->DefaultColumnFamily();
+  ASSERT_OK(db_->IngestExternalFile(default_cf, to_ingest,
+                                    IngestExternalFileOptions()));
+
+  // L0 must be sorted by epoch_number in descending order even though the
+  // ingested files carry a smaller largest_seqno than the flushed file.
+  const std::vector<FileMetaData*> l0_files =
+      GetLevelFileMetadatas(0 /* level */);
+  ASSERT_EQ(l0_files.size(), 3);
+
+  // Expectations per L0 position, newest first
+  const uint64_t want_epoch[] = {3, 2, 1};
+  const SequenceNumber want_largest_seqno[] = {0, 0, 2};
+  const uint64_t want_entries[] = {1, 1, 2};
+  const char* const want_largest_key[] = {"key3", "key2", "key4"};
+  for (size_t i = 0; i < 3; ++i) {
+    const FileMetaData* const f = l0_files[i];
+    EXPECT_EQ(f->epoch_number, want_epoch[i]);
+    EXPECT_EQ(f->fd.largest_seqno, want_largest_seqno[i]);
+    ASSERT_EQ(f->num_entries, want_entries[i]);
+    ASSERT_TRUE(f->largest.user_key() == Slice(want_largest_key[i]));
+  }
+  ASSERT_TRUE(l0_files[2]->smallest.user_key() == Slice("key1"));
+
+  // After compacting everything, the single output file must carry the
+  // minimum epoch_number among the input files.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  const std::vector<FileMetaData*> compacted =
+      GetLevelFileMetadatas(0 /* level */);
+  ASSERT_EQ(compacted.size(), 1);
+  const FileMetaData* const merged = compacted[0];
+  EXPECT_EQ(merged->epoch_number, 1);
+  ASSERT_EQ(merged->num_entries, 4);
+  ASSERT_TRUE(merged->largest.user_key() == Slice("key4"));
+  ASSERT_TRUE(merged->smallest.user_key() == Slice("key1"));
+}
+
+// Checks that a file moved from L1 to L0 by CompactRange() with change_level
+// set keeps the epoch_number it had before the move.
+TEST_F(DBTest2, SameEpochNumberAfterCompactRangeChangeLevel) {
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  opts.compaction_style = CompactionStyle::kCompactionStyleLevel;
+  opts.num_levels = 7;
+  DestroyAndReopen(opts);
+
+  // Put one file into L1; CompactRange() below moves it to L0
+  ASSERT_OK(Put("key1", "seq1"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1, 0);
+  std::vector<FileMetaData*> l0_files = GetLevelFileMetadatas(0 /* level */);
+  ASSERT_TRUE(l0_files.empty());
+  std::vector<FileMetaData*> l1_files = GetLevelFileMetadatas(1 /* level */);
+  ASSERT_EQ(l1_files.size(), 1);
+  const std::vector<FileMetaData*> l2_files =
+      GetLevelFileMetadatas(2 /* level */);
+  ASSERT_TRUE(l2_files.empty());
+
+  ASSERT_EQ(l1_files[0]->epoch_number, 1);
+
+  // Move the file to L0 and verify its epoch_number is unchanged
+  CompactRangeOptions move_to_l0;
+  move_to_l0.target_level = 0;
+  move_to_l0.change_level = true;
+  ASSERT_OK(db_->CompactRange(move_to_l0, nullptr, nullptr));
+  l0_files = GetLevelFileMetadatas(0 /* level */);
+  l1_files = GetLevelFileMetadatas(1 /* level */);
+  ASSERT_EQ(l0_files.size(), 1);
+  ASSERT_TRUE(l1_files.empty());
+
+  const FileMetaData* const moved = l0_files[0];
+  EXPECT_EQ(moved->epoch_number, 1);
+
+  ASSERT_EQ(moved->num_entries, 1);
+  ASSERT_TRUE(moved->largest.user_key() == Slice("key1"));
+}
+
+// Checks that per-file epoch numbers and each column family's next epoch
+// number are identical before and after reopening the DB, both with and
+// without allow_ingest_behind.
+TEST_F(DBTest2, RecoverEpochNumber) {
+  for (bool allow_ingest_behind : {true, false}) {
+    // With allow_ingest_behind every expected epoch number is shifted by
+    // kReservedEpochNumberForFileIngestedBehind.
+    const uint64_t epoch_offset =
+        allow_ingest_behind ? kReservedEpochNumberForFileIngestedBehind : 0;
+
+    Options opts = CurrentOptions();
+    opts.allow_ingest_behind = allow_ingest_behind;
+    opts.compaction_style = kCompactionStyleLevel;
+    opts.num_levels = 7;
+    opts.disable_auto_compactions = true;
+    DestroyAndReopen(opts);
+    CreateAndReopenWithCF({"cf1"}, opts);
+    VersionSet* version_set = dbfull()->GetVersionSet();
+    assert(version_set);
+    const ColumnFamilyData* default_cfd =
+        version_set->GetColumnFamilySet()->GetDefault();
+    const ColumnFamilyData* cf1_cfd =
+        version_set->GetColumnFamilySet()->GetColumnFamily("cf1");
+
+    // Default CF: first flushed file is moved to L1, second one stays in L0
+    ASSERT_OK(Put("key1", "epoch1"));
+    ASSERT_OK(Flush());
+    MoveFilesToLevel(1 /* level */, 0 /* cf */);
+    ASSERT_OK(Put("key2", "epoch2"));
+    ASSERT_OK(Flush());
+
+    std::vector<FileMetaData*> default_l0 =
+        GetLevelFileMetadatas(0 /* level */);
+    ASSERT_EQ(default_l0.size(), 1);
+    ASSERT_EQ(default_l0[0]->epoch_number, epoch_offset + 2);
+    ASSERT_EQ(default_l0[0]->num_entries, 1);
+    ASSERT_TRUE(default_l0[0]->largest.user_key() == Slice("key2"));
+
+    std::vector<FileMetaData*> default_l1 =
+        GetLevelFileMetadatas(1 /* level */);
+    ASSERT_EQ(default_l1.size(), 1);
+    ASSERT_EQ(default_l1[0]->epoch_number, epoch_offset + 1);
+    ASSERT_EQ(default_l1[0]->num_entries, 1);
+    ASSERT_TRUE(default_l1[0]->largest.user_key() == Slice("key1"));
+
+    // cf1: a single flushed file in L0
+    ASSERT_OK(Put(1 /* cf */, "cf1_key1", "epoch1"));
+    ASSERT_OK(Flush(1 /* cf */));
+
+    std::vector<FileMetaData*> cf1_l0 =
+        GetLevelFileMetadatas(0 /* level */, 1 /* cf */);
+    ASSERT_EQ(cf1_l0.size(), 1);
+    ASSERT_EQ(cf1_l0[0]->epoch_number, epoch_offset + 1);
+    ASSERT_EQ(cf1_l0[0]->num_entries, 1);
+    ASSERT_TRUE(cf1_l0[0]->largest.user_key() == Slice("cf1_key1"));
+
+    ASSERT_EQ(default_cfd->GetNextEpochNumber(), epoch_offset + 3);
+    ASSERT_EQ(cf1_cfd->GetNextEpochNumber(), epoch_offset + 2);
+
+    // Reopen, then verify the epoch numbers of files in different levels and
+    // column families were persisted and recovered unchanged.
+    ReopenWithColumnFamilies({"default", "cf1"}, opts);
+    version_set = dbfull()->GetVersionSet();
+    assert(version_set);
+    default_cfd = version_set->GetColumnFamilySet()->GetDefault();
+    cf1_cfd = version_set->GetColumnFamilySet()->GetColumnFamily("cf1");
+
+    default_l0 = GetLevelFileMetadatas(0 /* level */);
+    ASSERT_EQ(default_l0.size(), 1);
+    EXPECT_EQ(default_l0[0]->epoch_number, epoch_offset + 2);
+    ASSERT_EQ(default_l0[0]->num_entries, 1);
+    ASSERT_TRUE(default_l0[0]->largest.user_key() == Slice("key2"));
+
+    default_l1 = GetLevelFileMetadatas(1 /* level */);
+    ASSERT_EQ(default_l1.size(), 1);
+    EXPECT_EQ(default_l1[0]->epoch_number, epoch_offset + 1);
+    ASSERT_EQ(default_l1[0]->num_entries, 1);
+    ASSERT_TRUE(default_l1[0]->largest.user_key() == Slice("key1"));
+
+    cf1_l0 = GetLevelFileMetadatas(0 /* level */, 1 /* cf */);
+    ASSERT_EQ(cf1_l0.size(), 1);
+    EXPECT_EQ(cf1_l0[0]->epoch_number, epoch_offset + 1);
+    ASSERT_EQ(cf1_l0[0]->num_entries, 1);
+    ASSERT_TRUE(cf1_l0[0]->largest.user_key() == Slice("cf1_key1"));
+
+    // The next epoch number of each column family must be recovered as well
+    EXPECT_EQ(default_cfd->GetNextEpochNumber(), epoch_offset + 3);
+    EXPECT_EQ(cf1_cfd->GetNextEpochNumber(), epoch_offset + 2);
+  }
+}
+
+#endif // ROCKSDB_LITE
+
TEST_F(DBTest2, RenameDirectory) {
Options options = CurrentOptions();
DestroyAndReopen(options);
lf->smallest, lf->largest, lf->fd.smallest_seqno,
lf->fd.largest_seqno, lf->marked_for_compaction, temp,
lf->oldest_blob_file_number, lf->oldest_ancester_time,
- lf->file_creation_time, lf->file_checksum,
+ lf->file_creation_time, lf->epoch_number, lf->file_checksum,
lf->file_checksum_func_name, lf->unique_id);
}
}
f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(),
f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno,
f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber,
- oldest_ancester_time, current_time, f.file_checksum,
- f.file_checksum_func_name, f.unique_id);
+ oldest_ancester_time, current_time,
+ ingestion_options_.ingest_behind
+ ? kReservedEpochNumberForFileIngestedBehind
+ : cfd_->NewEpochNumber(),
+ f.file_checksum, f.file_checksum_func_name, f.unique_id);
f_metadata.temperature = f.file_temperature;
edit_.AddFile(f.picked_level, f_metadata);
}
#include "db/memtable_list.h"
#include "db/merge_context.h"
#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
#include "db/version_set.h"
#include "file/file_util.h"
#include "file/filename.h"
// path 0 for level 0 file.
meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+ meta_.epoch_number = cfd_->NewEpochNumber();
base_ = cfd_->current();
base_->Ref(); // it is likely that we do not need this reference
meta_.fd.smallest_seqno, meta_.fd.largest_seqno,
meta_.marked_for_compaction, meta_.temperature,
meta_.oldest_blob_file_number, meta_.oldest_ancester_time,
- meta_.file_creation_time, meta_.file_checksum,
- meta_.file_checksum_func_name, meta_.unique_id);
+ meta_.file_creation_time, meta_.epoch_number,
+ meta_.file_checksum, meta_.file_checksum_func_name,
+ meta_.unique_id);
edit_->SetBlobFileAdditions(std::move(blob_file_additions));
}
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
+#include "db/version_builder.h"
#ifndef ROCKSDB_LITE
#include "db/import_column_family_job.h"
static_cast<uint64_t>(temp_current_time);
}
- VersionBuilder version_builder(
+ // Recover files' epoch number using dummy VersionStorageInfo
+ VersionBuilder dummy_version_builder(
cfd_->current()->version_set()->file_options(), cfd_->ioptions(),
cfd_->table_cache(), cfd_->current()->storage_info(),
cfd_->current()->version_set(),
cfd_->GetFileMetadataCacheReservationManager());
- VersionStorageInfo vstorage(
+ VersionStorageInfo dummy_vstorage(
&cfd_->internal_comparator(), cfd_->user_comparator(),
cfd_->NumberLevels(), cfd_->ioptions()->compaction_style,
- nullptr /* src_vstorage */, cfd_->ioptions()->force_consistency_checks);
+ nullptr /* src_vstorage */, cfd_->ioptions()->force_consistency_checks,
+ EpochNumberRequirement::kMightMissing);
Status s;
for (size_t i = 0; s.ok() && i < files_to_import_.size(); ++i) {
const auto& f = files_to_import_[i];
const auto& file_metadata = metadata_[i];
- VersionEdit version_edit;
- version_edit.AddFile(
+ VersionEdit dummy_version_edit;
+ dummy_version_edit.AddFile(
file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key,
file_metadata.smallest_seqno, file_metadata.largest_seqno, false,
file_metadata.temperature, kInvalidBlobFileNumber, oldest_ancester_time,
- current_time, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
- f.unique_id);
- s = version_builder.Apply(&version_edit);
+ current_time, file_metadata.epoch_number, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, f.unique_id);
+ s = dummy_version_builder.Apply(&dummy_version_edit);
}
if (s.ok()) {
- s = version_builder.SaveTo(&vstorage);
+ s = dummy_version_builder.SaveTo(&dummy_vstorage);
}
+ if (s.ok()) {
+ dummy_vstorage.RecoverEpochNumbers(cfd_);
+ }
+
+ // Record changes from this CF import in VersionEdit, including files with
+ // recovered epoch numbers
if (s.ok()) {
edit_.SetColumnFamily(cfd_->GetID());
- for (int level = 0; level < vstorage.num_levels(); level++) {
- for (FileMetaData* file_meta : vstorage.LevelFiles(level)) {
+ for (int level = 0; level < dummy_vstorage.num_levels(); level++) {
+ for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) {
edit_.AddFile(level, *file_meta);
// If incoming sequence number is higher, update local sequence number.
if (file_meta->fd.largest_seqno > versions_->LastSequence()) {
}
}
}
- for (int level = 0; level < vstorage.num_levels(); level++) {
- for (FileMetaData* file_meta : vstorage.LevelFiles(level)) {
+
+ // Release resources occupied by the dummy VersionStorageInfo
+ for (int level = 0; level < dummy_vstorage.num_levels(); level++) {
+ for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) {
file_meta->refs--;
if (file_meta->refs <= 0) {
delete file_meta;
return status;
}
-
} // namespace ROCKSDB_NAMESPACE
#endif // !ROCKSDB_LITE
// Store per-table metadata (smallest, largest, largest-seq#, ...)
// in the table's meta section to speed up ScanTable.
+#include "db/version_builder.h"
#ifndef ROCKSDB_LITE
#include <cinttypes>
for (const auto& cf_id_and_tables : cf_id_to_tables) {
auto* cfd =
vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first);
- VersionEdit edit;
- edit.SetComparatorName(cfd->user_comparator()->Name());
- edit.SetLogNumber(0);
- edit.SetNextFile(next_file_number_);
- edit.SetColumnFamily(cfd->GetID());
- // TODO(opt): separate out into multiple levels
+ // Recover files' epoch number using dummy VersionStorageInfo
+ VersionBuilder dummy_version_builder(
+ cfd->current()->version_set()->file_options(), cfd->ioptions(),
+ cfd->table_cache(), cfd->current()->storage_info(),
+ cfd->current()->version_set(),
+ cfd->GetFileMetadataCacheReservationManager());
+ VersionStorageInfo dummy_vstorage(
+ &cfd->internal_comparator(), cfd->user_comparator(),
+ cfd->NumberLevels(), cfd->ioptions()->compaction_style,
+ nullptr /* src_vstorage */, cfd->ioptions()->force_consistency_checks,
+ EpochNumberRequirement::kMightMissing);
+ Status s;
+ VersionEdit dummy_edit;
for (const auto* table : cf_id_and_tables.second) {
- edit.AddFile(
+ // TODO(opt): separate out into multiple levels
+ dummy_edit.AddFile(
0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
table->meta.fd.GetFileSize(), table->meta.smallest,
table->meta.largest, table->meta.fd.smallest_seqno,
table->meta.fd.largest_seqno, table->meta.marked_for_compaction,
table->meta.temperature, table->meta.oldest_blob_file_number,
table->meta.oldest_ancester_time, table->meta.file_creation_time,
- table->meta.file_checksum, table->meta.file_checksum_func_name,
- table->meta.unique_id);
+ table->meta.epoch_number, table->meta.file_checksum,
+ table->meta.file_checksum_func_name, table->meta.unique_id);
}
- assert(next_file_number_ > 0);
- vset_.MarkFileNumberUsed(next_file_number_ - 1);
- mutex_.Lock();
- std::unique_ptr<FSDirectory> db_dir;
- Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(),
- &db_dir, nullptr);
- if (status.ok()) {
- status = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
- &edit, &mutex_, db_dir.get(),
- false /* new_descriptor_log */);
+ s = dummy_version_builder.Apply(&dummy_edit);
+ if (s.ok()) {
+ s = dummy_version_builder.SaveTo(&dummy_vstorage);
}
- mutex_.Unlock();
- if (!status.ok()) {
- return status;
+ if (s.ok()) {
+ dummy_vstorage.RecoverEpochNumbers(cfd);
+ }
+ if (s.ok()) {
+ // Record changes from this repair in VersionEdit, including files with
+ // recovered epoch numbers
+ VersionEdit edit;
+ edit.SetComparatorName(cfd->user_comparator()->Name());
+ edit.SetLogNumber(0);
+ edit.SetNextFile(next_file_number_);
+ edit.SetColumnFamily(cfd->GetID());
+ for (int level = 0; level < dummy_vstorage.num_levels(); ++level) {
+ for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) {
+ edit.AddFile(level, *file_meta);
+ }
+ }
+
+ // Release resources occupied by the dummy VersionStorageInfo
+ for (int level = 0; level < dummy_vstorage.num_levels(); ++level) {
+ for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) {
+ file_meta->refs--;
+ if (file_meta->refs <= 0) {
+ delete file_meta;
+ }
+ }
+ }
+
+ // Persist record of changes
+ assert(next_file_number_ > 0);
+ vset_.MarkFileNumberUsed(next_file_number_ - 1);
+ mutex_.Lock();
+ std::unique_ptr<FSDirectory> db_dir;
+ s = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), &db_dir,
+ nullptr);
+ if (s.ok()) {
+ s = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit,
+ &mutex_, db_dir.get(),
+ false /* new_descriptor_log */);
+ }
+ mutex_.Unlock();
+ }
+ if (!s.ok()) {
+ return s;
}
}
return Status::OK();
ASSERT_GT(verify_passed, 0);
SyncPoint::GetInstance()->DisableProcessing();
}
+
+ // Returns the file metadata pointers stored for `level` in the current
+ // Version of the column family looked up with `cf` (default 0).
+ std::vector<FileMetaData*> GetLevelFileMetadatas(int level, int cf = 0) {
+   VersionSet* const vset = dbfull()->GetVersionSet();
+   assert(vset);
+   ColumnFamilyData* const column_family =
+       vset->GetColumnFamilySet()->GetColumnFamily(cf);
+   assert(column_family);
+   Version* const live_version = column_family->current();
+   assert(live_version);
+   VersionStorageInfo* const vstorage = live_version->storage_info();
+   assert(vstorage);
+   const auto& level_files = vstorage->LevelFiles(level);
+   return level_files;
+ }
};
+// Deletes the manifest, runs RepairDB() and expects all table files to end up
+// in L0 with epoch numbers 3, 2, 1 (newest first), so that the newest value
+// of "k1" is the one returned.
+TEST_F(RepairTest, SortRepairedDBL0ByEpochNumber) {
+  Options opts = CurrentOptions();
+  DestroyAndReopen(opts);
+
+  // First file: flushed, then moved to L1
+  ASSERT_OK(Put("k1", "oldest"));
+  ASSERT_OK(Put("k1", "older"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+
+  // Second file: flushed and left in L0
+  ASSERT_OK(Put("k1", "old"));
+  ASSERT_OK(Flush());
+
+  // Newest value: written but not flushed by this test
+  ASSERT_OK(Put("k1", "new"));
+
+  const std::vector<FileMetaData*> l0_before =
+      GetLevelFileMetadatas(0 /* level */);
+  ASSERT_EQ(l0_before.size(), 1);
+  ASSERT_EQ(l0_before[0]->epoch_number, 2);
+  const std::vector<FileMetaData*> l1_before =
+      GetLevelFileMetadatas(1 /* level */);
+  ASSERT_EQ(l1_before.size(), 1);
+  ASSERT_EQ(l1_before[0]->epoch_number, 1);
+
+  // Drop the manifest so that RepairDB() has to rebuild the file metadata
+  const uint64_t manifest_number = dbfull()->TEST_Current_Manifest_FileNo();
+  const std::string manifest_file =
+      DescriptorFileName(dbname_, manifest_number);
+  Close();
+  ASSERT_OK(env_->FileExists(manifest_file));
+  ASSERT_OK(env_->DeleteFile(manifest_file));
+
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+  ReopenWithSstIdVerify();
+
+  EXPECT_EQ(Get("k1"), "new");
+
+  const std::vector<FileMetaData*> l0_after =
+      GetLevelFileMetadatas(0 /* level */);
+  ASSERT_EQ(l0_after.size(), 3);
+  // Epoch numbers are expected to count down from 3 along the L0 order
+  uint64_t want_epoch = 3;
+  for (const FileMetaData* f : l0_after) {
+    EXPECT_EQ(f->epoch_number, want_epoch);
+    --want_epoch;
+  }
+  const std::vector<FileMetaData*> l1_after =
+      GetLevelFileMetadatas(1 /* level */);
+  ASSERT_TRUE(l1_after.empty());
+}
+
TEST_F(RepairTest, LostManifest) {
// Add a couple SST files, delete the manifest, and verify RepairDB() saves
// the day.
#include "db/dbformat.h"
#include "db/internal_stats.h"
#include "db/table_cache.h"
+#include "db/version_edit.h"
#include "db/version_set.h"
#include "port/port.h"
#include "table/table_reader.h"
namespace ROCKSDB_NAMESPACE {
class VersionBuilder::Rep {
- class NewestFirstBySeqNo {
+ class NewestFirstByEpochNumber {  // orders files by epoch_number, largest first
+ private:
+ inline static const NewestFirstBySeqNo seqno_cmp;  // used when epoch numbers tie
+
public:
bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
assert(lhs);
assert(rhs);
- if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) {
- return lhs->fd.largest_seqno > rhs->fd.largest_seqno;
- }
-
- if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) {
- return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno;
+ if (lhs->epoch_number != rhs->epoch_number) {
+ return lhs->epoch_number > rhs->epoch_number;  // larger epoch_number sorts first
+ } else {
+ return seqno_cmp(lhs, rhs);  // equal epoch numbers: defer to the seqno ordering
}
-
- // Break ties by file number
- return lhs->fd.GetNumber() > rhs->fd.GetNumber();
}
};
-
class BySmallestKey {
public:
explicit BySmallestKey(const InternalKeyComparator* cmp) : cmp_(cmp) {}
std::unordered_map<uint64_t, int> table_file_levels_;
// Current compact cursors that should be changed after the last compaction
std::unordered_map<int, InternalKey> updated_compact_cursors_;
- NewestFirstBySeqNo level_zero_cmp_;
+ NewestFirstByEpochNumber level_zero_cmp_by_epochno_;
+ NewestFirstBySeqNo level_zero_cmp_by_seqno_;
BySmallestKey level_nonzero_cmp_;
// Mutable metadata objects for all blob files affected by the series of
ExpectedLinkedSsts expected_linked_ssts;
if (num_levels_ > 0) {
+ const InternalKeyComparator* const icmp = vstorage->InternalComparator();
+ EpochNumberRequirement epoch_number_requirement =
+ vstorage->GetEpochNumberRequirement();
+ assert(icmp);
// Check L0
{
- auto l0_checker = [this](const FileMetaData* lhs,
- const FileMetaData* rhs) {
+ auto l0_checker = [this, epoch_number_requirement, icmp](
+ const FileMetaData* lhs,
+ const FileMetaData* rhs) {
assert(lhs);
assert(rhs);
- if (!level_zero_cmp_(lhs, rhs)) {
- std::ostringstream oss;
- oss << "L0 files are not sorted properly: files #"
- << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber();
-
- return Status::Corruption("VersionBuilder", oss.str());
- }
-
- if (rhs->fd.smallest_seqno == rhs->fd.largest_seqno) {
- // This is an external file that we ingested
- const SequenceNumber external_file_seqno = rhs->fd.smallest_seqno;
-
- if (!(external_file_seqno < lhs->fd.largest_seqno ||
- external_file_seqno == 0)) {
+ if (epoch_number_requirement ==
+ EpochNumberRequirement::kMightMissing) {
+ if (!level_zero_cmp_by_seqno_(lhs, rhs)) {
std::ostringstream oss;
- oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno "
- << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno
- << " vs. file #" << rhs->fd.GetNumber()
- << " with global_seqno " << external_file_seqno;
-
+ oss << "L0 files are not sorted properly: files #"
+ << lhs->fd.GetNumber() << " with seqnos (largest, smallest) "
+ << lhs->fd.largest_seqno << " , " << lhs->fd.smallest_seqno
+ << ", #" << rhs->fd.GetNumber()
+ << " with seqnos (largest, smallest) "
+ << rhs->fd.largest_seqno << " , " << rhs->fd.smallest_seqno;
return Status::Corruption("VersionBuilder", oss.str());
}
- } else if (lhs->fd.smallest_seqno <= rhs->fd.smallest_seqno) {
- std::ostringstream oss;
- oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno "
- << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno
- << " vs. file #" << rhs->fd.GetNumber() << " with seqno "
- << rhs->fd.smallest_seqno << ' ' << rhs->fd.largest_seqno;
+ } else if (epoch_number_requirement ==
+ EpochNumberRequirement::kMustPresent) {
+ if (lhs->epoch_number == rhs->epoch_number) {
+ bool range_overlapped =
+ icmp->Compare(lhs->smallest, rhs->largest) <= 0 &&
+ icmp->Compare(lhs->largest, rhs->smallest) >= 0;
+
+ if (range_overlapped) {
+ std::ostringstream oss;
+ oss << "L0 files of same epoch number but overlapping range #"
+ << lhs->fd.GetNumber()
+ << " , smallest key: " << lhs->smallest.DebugString(true)
+ << " , largest key: " << lhs->largest.DebugString(true)
+ << " , epoch number: " << lhs->epoch_number << " vs. file #"
+ << rhs->fd.GetNumber()
+ << " , smallest key: " << rhs->smallest.DebugString(true)
+ << " , largest key: " << rhs->largest.DebugString(true)
+ << " , epoch number: " << rhs->epoch_number;
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+ }
- return Status::Corruption("VersionBuilder", oss.str());
+ if (!level_zero_cmp_by_epochno_(lhs, rhs)) {
+ std::ostringstream oss;
+ oss << "L0 files are not sorted properly: files #"
+ << lhs->fd.GetNumber() << " with epoch number "
+ << lhs->epoch_number << ", #" << rhs->fd.GetNumber()
+ << " with epoch number " << rhs->epoch_number;
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
}
return Status::OK();
}
// Check L1 and up
- const InternalKeyComparator* const icmp = vstorage->InternalComparator();
- assert(icmp);
for (int level = 1; level < num_levels_; ++level) {
auto checker = [this, level, icmp](const FileMetaData* lhs,
}
}
+ // Sets `vstorage`'s epoch number requirement to
+ // EpochNumberRequirement::kMustPresent when `vstorage` reports no missing
+ // epoch number and no file in any level's `added_files` has
+ // kUnknownEpochNumber. Returns true iff the requirement was set.
+ bool PromoteEpochNumberRequirementIfNeeded(
+     VersionStorageInfo* vstorage) const {
+   bool all_epochs_known = !vstorage->HasMissingEpochNumber();
+   for (int level = 0; all_epochs_known && level < num_levels_; ++level) {
+     for (const auto& entry : levels_[level].added_files) {
+       if (entry.second->epoch_number == kUnknownEpochNumber) {
+         all_epochs_known = false;
+         break;
+       }
+     }
+   }
+
+   if (all_epochs_known) {
+     vstorage->SetEpochNumberRequirement(EpochNumberRequirement::kMustPresent);
+   }
+   return all_epochs_known;
+ }
+
void SaveSSTFilesTo(VersionStorageInfo* vstorage) const {
assert(vstorage);
return;
}
- SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_);
+ EpochNumberRequirement epoch_number_requirement =
+ vstorage->GetEpochNumberRequirement();
+
+ if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) {
+ bool promoted = PromoteEpochNumberRequirementIfNeeded(vstorage);
+ if (promoted) {
+ epoch_number_requirement = vstorage->GetEpochNumberRequirement();
+ }
+ }
+
+ if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) {
+ SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_seqno_);
+ } else {
+ SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_epochno_);
+ }
for (int level = 1; level < num_levels_; ++level) {
SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_);
#include <memory>
+#include "db/version_edit.h"
#include "rocksdb/file_system.h"
+#include "rocksdb/metadata.h"
#include "rocksdb/slice_transform.h"
namespace ROCKSDB_NAMESPACE {
Version* version_;
};
+// Comparator that orders files newest-first by sequence number: larger
+// largest_seqno first, then larger smallest_seqno, then larger file number.
+class NewestFirstBySeqNo {
+ public:
+  bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
+    assert(lhs);
+    assert(rhs);
+
+    const auto& left = lhs->fd;
+    const auto& right = rhs->fd;
+
+    if (left.largest_seqno == right.largest_seqno) {
+      if (left.smallest_seqno == right.smallest_seqno) {
+        // Both seqno bounds are equal: break ties by file number
+        return left.GetNumber() > right.GetNumber();
+      }
+      return left.smallest_seqno > right.smallest_seqno;
+    }
+    return left.largest_seqno > right.largest_seqno;
+  }
+};
} // namespace ROCKSDB_NAMESPACE
uint64_t num_entries = 0, uint64_t num_deletions = 0,
bool sampled = false, SequenceNumber smallest_seqno = 0,
SequenceNumber largest_seqno = 0,
- uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber,
+ uint64_t epoch_number = kUnknownEpochNumber) {
assert(level < vstorage_.num_levels());
FileMetaData* f = new FileMetaData(
file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq),
GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno,
/* marked_for_compact */ false, Temperature::kUnknown,
oldest_blob_file_number, kUnknownOldestAncesterTime,
- kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kNullUniqueId64x2);
f->compensated_file_size = file_size;
f->num_entries = num_entries;
vstorage_.AddBlobFile(std::move(meta));
}
- void AddDummyFile(uint64_t table_file_number, uint64_t blob_file_number) {
+ void AddDummyFile(uint64_t table_file_number, uint64_t blob_file_number,
+ uint64_t epoch_number) {
constexpr int level = 0;
constexpr char smallest[] = "bar";
constexpr char largest[] = "foo";
Add(level, table_file_number, smallest, largest, file_size, path_id,
smallest_seq, largest_seq, num_entries, num_deletions, sampled,
- smallest_seq, largest_seq, blob_file_number);
+ smallest_seq, largest_seq, blob_file_number, epoch_number);
}
void AddDummyFileToEdit(VersionEdit* edit, uint64_t table_file_number,
- uint64_t blob_file_number) {
+ uint64_t blob_file_number, uint64_t epoch_number) {
assert(edit);
constexpr int level = 0;
level, table_file_number, path_id, file_size, GetInternalKey(smallest),
GetInternalKey(largest), smallest_seqno, largest_seqno,
marked_for_compaction, Temperature::kUnknown, blob_file_number,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
}
}
TEST_F(VersionBuilderTest, ApplyAndSaveTo) {
- Add(0, 1U, "150", "200", 100U);
+ Add(0, 1U, "150", "200", 100U, /*path_id*/ 0,
+ /*smallest_seq*/ 100, /*largest_seq*/ 100,
+ /*num_entries*/ 0, /*num_deletions*/ 0,
+ /*sampled*/ false, /*smallest_seqno*/ 0,
+ /*largest_seqno*/ 0,
+ /*oldest_blob_file_number*/ kInvalidBlobFileNumber,
+ /*epoch_number*/ 1);
Add(1, 66U, "150", "200", 100U);
Add(1, 88U, "201", "300", 100U);
version_edit.AddFile(
2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
version_edit.DeleteFile(3, 27U);
TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) {
ioptions_.level_compaction_dynamic_level_bytes = true;
- Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
- Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+ Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U,
+ /*oldest_blob_file_number*/ kInvalidBlobFileNumber,
+ /*epoch_number*/ 2);
+ Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U,
+ /*oldest_blob_file_number*/ kInvalidBlobFileNumber,
+ /*epoch_number*/ 1);
Add(4, 6U, "150", "179", 100U);
Add(4, 7U, "180", "220", 100U);
version_edit.AddFile(
3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
version_edit.DeleteFile(0, 1U);
version_edit.DeleteFile(0, 88U);
TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) {
ioptions_.level_compaction_dynamic_level_bytes = true;
- Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
- Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+ Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U,
+ /*oldest_blob_file_number*/ kInvalidBlobFileNumber,
+ /*epoch_number*/ 2);
+ Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U,
+ /*oldest_blob_file_number*/ kInvalidBlobFileNumber,
+ /*epoch_number*/ 1);
Add(4, 6U, "150", "179", 100U);
Add(4, 7U, "180", "220", 100U);
version_edit.AddFile(
4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
version_edit.DeleteFile(0, 1U);
version_edit.DeleteFile(0, 88U);
version_edit.AddFile(
2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
version_edit.AddFile(
2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
version_edit.AddFile(
2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
version_edit.AddFile(
2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
version_edit.AddFile(
2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
EnvOptions env_options;
version_edit.AddFile(
2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
version_edit.AddFile(
2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
version_edit.AddFile(
2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
version_edit.AddFile(
2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
version_edit.AddFile(
2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
ASSERT_OK(version_builder.Apply(&version_edit));
version_edit.AddFile(
2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
version_edit2.DeleteFile(2, 616);
version_edit2.DeleteFile(2, 636);
version_edit.AddFile(
2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
ASSERT_OK(version_builder.Apply(&version_edit2));
constexpr bool marked_for_compaction = false;
- addition.AddFile(level, file_number, path_id, file_size,
- GetInternalKey(smallest, smallest_seq),
- GetInternalKey(largest, largest_seq), smallest_seqno,
- largest_seqno, marked_for_compaction, Temperature::kUnknown,
- kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
- kUnknownFileCreationTime, kUnknownFileChecksum,
- kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ addition.AddFile(
+ level, file_number, path_id, file_size,
+ GetInternalKey(smallest, smallest_seq),
+ GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno,
+ marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
ASSERT_OK(builder.Apply(&addition));
new_level, file_number, path_id, file_size, GetInternalKey(smallest),
GetInternalKey(largest), smallest_seqno, largest_seqno,
marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
const Status s = builder.Apply(&edit);
constexpr SequenceNumber largest_seqno = 1000;
constexpr bool marked_for_compaction = false;
- edit.AddFile(level, file_number, path_id, file_size, GetInternalKey(smallest),
- GetInternalKey(largest), smallest_seqno, largest_seqno,
- marked_for_compaction, Temperature::kUnknown,
- kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
- kUnknownFileCreationTime, kUnknownFileChecksum,
- kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ edit.AddFile(
+ level, file_number, path_id, file_size, GetInternalKey(smallest),
+ GetInternalKey(largest), smallest_seqno, largest_seqno,
+ marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
ASSERT_OK(builder.Apply(&edit));
new_level, file_number, path_id, file_size, GetInternalKey(smallest),
GetInternalKey(largest), smallest_seqno, largest_seqno,
marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
const Status s = builder.Apply(&other_edit);
level, file_number, path_id, file_size, GetInternalKey(smallest),
GetInternalKey(largest), smallest_seqno, largest_seqno,
marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
ASSERT_OK(builder.Apply(&addition));
// Add dummy table file to ensure the blob file is referenced.
constexpr uint64_t table_file_number = 1;
- AddDummyFileToEdit(&edit, table_file_number, blob_file_number);
+ AddDummyFileToEdit(&edit, table_file_number, blob_file_number,
+ 1 /*epoch_number*/);
ASSERT_OK(builder.Apply(&edit));
ASSERT_NE(meta, nullptr);
// Add dummy table file to ensure the blob file is referenced.
- AddDummyFile(table_file_number, blob_file_number);
+ AddDummyFile(table_file_number, blob_file_number, 1 /*epoch_number*/);
UpdateVersionStorageInfo();
// Add dummy table file to ensure the blob file is referenced.
constexpr uint64_t table_file_number = 1;
- AddDummyFileToEdit(&addition, table_file_number, blob_file_number);
+ AddDummyFileToEdit(&addition, table_file_number, blob_file_number,
+ 1 /*epoch_number*/);
ASSERT_OK(builder.Apply(&addition));
// Add dummy table file to ensure the blob file is referenced.
constexpr uint64_t table_file_number = 1;
- AddDummyFileToEdit(&addition, table_file_number, blob_file_number);
+ AddDummyFileToEdit(&addition, table_file_number, blob_file_number,
+ 1 /*epoch_number*/);
ASSERT_OK(builder.Apply(&addition));
const uint64_t table_file_number = 2 * i;
const uint64_t blob_file_number = 2 * i + 1;
- AddDummyFile(table_file_number, blob_file_number);
+ AddDummyFile(table_file_number, blob_file_number, i /*epoch_number*/);
}
UpdateVersionStorageInfo();
constexpr uint64_t garbage_blob_count = 0;
constexpr uint64_t garbage_blob_bytes = 0;
- AddDummyFile(base_table_file_number, base_blob_file_number);
+ AddDummyFile(base_table_file_number, base_blob_file_number,
+ 1 /*epoch_number*/);
AddBlob(base_blob_file_number, base_total_blob_count, base_total_blob_bytes,
checksum_method, checksum_value,
BlobFileMetaData::LinkedSsts{base_table_file_number},
constexpr uint64_t total_blob_count = 234;
constexpr uint64_t total_blob_bytes = 1 << 22;
- edit.AddFile(level, table_file_number, path_id, file_size,
- GetInternalKey(smallest), GetInternalKey(largest),
- smallest_seqno, largest_seqno, marked_for_compaction,
- Temperature::kUnknown, blob_file_number,
- kUnknownOldestAncesterTime, kUnknownFileCreationTime,
- checksum_value, checksum_method, kNullUniqueId64x2);
+ edit.AddFile(
+ level, table_file_number, path_id, file_size, GetInternalKey(smallest),
+ GetInternalKey(largest), smallest_seqno, largest_seqno,
+ marked_for_compaction, Temperature::kUnknown, blob_file_number,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime, 2 /*epoch_number*/,
+ checksum_value, checksum_method, kNullUniqueId64x2);
edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
checksum_method, checksum_value);
/* largest_seqno */ 200, /* marked_for_compaction */ false,
Temperature::kUnknown,
/* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime,
- kUnknownFileCreationTime, kUnknownFileChecksum,
- kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ kUnknownFileCreationTime, kUnknownEpochNumber,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0,
/* file_size */ 100, /* smallest */ GetInternalKey("801"),
/* largest_seqno */ 200, /* marked_for_compaction */ false,
Temperature::kUnknown,
/* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime,
- kUnknownFileCreationTime, kUnknownFileChecksum,
- kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ kUnknownFileCreationTime, kUnknownEpochNumber,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000,
/* total_blob_bytes */ 200000,
/* checksum_method */ std::string(),
/* largest_seqno */ 2100, /* marked_for_compaction */ false,
Temperature::kUnknown,
/* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime,
- kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kNullUniqueId64x2);
// Add an SST that does not reference any blob files.
/* largest */ GetInternalKey("22", 2200), /* smallest_seqno */ 2200,
/* largest_seqno */ 2200, /* marked_for_compaction */ false,
Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
- kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kNullUniqueId64x2);
// Delete a file that references a blob file.
/* largest_seqno */ 300, /* marked_for_compaction */ false,
Temperature::kUnknown,
/* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime,
- kUnknownFileCreationTime, kUnknownFileChecksum,
- kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ kUnknownFileCreationTime, kUnknownEpochNumber,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
// Trivially move a file that does not reference any blob files.
edit.DeleteFile(/* level */ 1, /* file_number */ 13);
/* largest_seqno */ 1300, /* marked_for_compaction */ false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
- kUnknownFileChecksum, kUnknownFileChecksumFuncName,
- kNullUniqueId64x2);
+ kUnknownEpochNumber, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
// Add one more SST file that references a blob file, then promptly
// delete it in a second version edit before the new version gets saved.
/* largest_seqno */ 2300, /* marked_for_compaction */ false,
Temperature::kUnknown,
/* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime,
- kUnknownFileCreationTime, kUnknownFileChecksum,
- kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ kUnknownFileCreationTime, kUnknownEpochNumber,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
VersionEdit edit2;
}
TEST_F(VersionBuilderTest, CheckConsistencyForFileDeletedTwice) {
- Add(0, 1U, "150", "200", 100U);
+ Add(0, 1U, "150", "200", 100, /*path_id*/ 0,
+ /*smallest_seq*/ 100, /*largest_seq*/ 100,
+ /*num_entries*/ 0, /*num_deletions*/ 0,
+ /*sampled*/ false, /*smallest_seqno*/ 0,
+ /*largest_seqno*/ 0,
+ /*oldest_blob_file_number*/ kInvalidBlobFileNumber,
+ /*epoch_number*/ 1);
UpdateVersionStorageInfo();
UnrefFilesInVersion(&new_vstorage2);
}
+TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) {
+ Status s;  // reused by both scenarios below
+ // Verifies SaveTo()'s L0 epoch_number checks. Case 1: files sharing an epoch
+ // number but overlapping in key range are caught as corrupted.
+ VersionEdit version_edit_1;
+ version_edit_1.AddFile(
+ /* level */ 0, /* file_number */ 1U, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("a", 1),
+ /* largest */ GetInternalKey("c", 3), /* smallest_seqno */ 1,
+ /* largest_seqno */ 3, /* marked_for_compaction */ false,
+ Temperature::kUnknown,
+ /* oldest_blob_file_number */ kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+ version_edit_1.AddFile(
+ /* level */ 0, /* file_number */ 2U, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("b", 2),
+ /* largest */ GetInternalKey("d", 4), /* smallest_seqno */ 2,
+ /* largest_seqno */ 4, /* marked_for_compaction */ false,
+ Temperature::kUnknown,
+ /* oldest_blob_file_number */ kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+ // Build on top of vstorage_ and save with force_consistency_checks enabled.
+ VersionBuilder version_builder_1(EnvOptions(), &ioptions_,
+ nullptr /* table_cache */, &vstorage_,
+ nullptr /* file_metadata_cache_res_mgr */);
+ VersionStorageInfo new_vstorage_1(
+ &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel,
+ nullptr /* src_vstorage */, true /* force_consistency_checks */);
+ // Apply() is expected to succeed; the corruption is reported by SaveTo().
+ ASSERT_OK(version_builder_1.Apply(&version_edit_1));
+ s = version_builder_1.SaveTo(&new_vstorage_1);
+ EXPECT_TRUE(s.IsCorruption());
+ EXPECT_TRUE(std::strstr(
+ s.getState(), "L0 files of same epoch number but overlapping range"));
+ UnrefFilesInVersion(&new_vstorage_1);
+
+ // Case 2: L0 files not sorted by epoch_number are caught as corrupted.
+ VersionEdit version_edit_2;
+ version_edit_2.AddFile(
+ /* level */ 0, /* file_number */ 1U, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("a", 1),
+ /* largest */ GetInternalKey("a", 1), /* smallest_seqno */ 1,
+ /* largest_seqno */ 1, /* marked_for_compaction */ false,
+ Temperature::kUnknown,
+ /* oldest_blob_file_number */ kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+ version_edit_2.AddFile(
+ /* level */ 0, /* file_number */ 2U, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("b", 2),
+ /* largest */ GetInternalKey("b", 2), /* smallest_seqno */ 2,
+ /* largest_seqno */ 2, /* marked_for_compaction */ false,
+ Temperature::kUnknown,
+ /* oldest_blob_file_number */ kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ 2 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+ // Distinct epoch numbers and disjoint keys: this build is expected to pass.
+ VersionBuilder version_builder_2(EnvOptions(), &ioptions_,
+ nullptr /* table_cache */, &vstorage_,
+ nullptr /* file_metadata_cache_res_mgr */);
+ VersionStorageInfo new_vstorage_2(
+ &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel,
+ nullptr /* src_vstorage */, true /* force_consistency_checks */);
+
+ ASSERT_OK(version_builder_2.Apply(&version_edit_2));
+ s = version_builder_2.SaveTo(&new_vstorage_2);
+ ASSERT_TRUE(s.ok());
+
+ const std::vector<FileMetaData*>& l0_files = new_vstorage_2.LevelFiles(0);
+ ASSERT_EQ(l0_files.size(), 2);
+ // Manually corrupt the L0 files' epoch numbers in the saved storage.
+ l0_files[0]->epoch_number = 1;
+ l0_files[1]->epoch_number = 2;
+
+ // Surface the corruption error by applying an empty dummy version edit.
+ VersionEdit dummy_version_edit;
+ VersionBuilder dummy_version_builder(
+ EnvOptions(), &ioptions_, nullptr /* table_cache */, &vstorage_,
+ nullptr /* file_metadata_cache_res_mgr */);
+ ASSERT_OK(dummy_version_builder.Apply(&dummy_version_edit));
+ s = dummy_version_builder.SaveTo(&new_vstorage_2);
+ EXPECT_TRUE(s.IsCorruption());
+ EXPECT_TRUE(std::strstr(s.getState(), "L0 files are not sorted properly"));
+
+ UnrefFilesInVersion(&new_vstorage_2);
+}
+
TEST_F(VersionBuilderTest, EstimatedActiveKeys) {
const uint32_t kTotalSamples = 20;
const uint32_t kNumLevels = 5;
bool min_log_num_written = false;
for (size_t i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
- if (!f.smallest.Valid() || !f.largest.Valid()) {
+ if (!f.smallest.Valid() || !f.largest.Valid() ||
+ f.epoch_number == kUnknownEpochNumber) {
return false;
}
PutVarint32(dst, kNewFile4);
&varint_file_creation_time);
PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
+ PutVarint32(dst, NewFileCustomTag::kEpochNumber);
+ std::string varint_epoch_number;
+ PutVarint64(&varint_epoch_number, f.epoch_number);
+ PutLengthPrefixedSlice(dst, Slice(varint_epoch_number));
+
PutVarint32(dst, NewFileCustomTag::kFileChecksum);
PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
return "invalid file creation time";
}
break;
+ case kEpochNumber:
+ if (!GetVarint64(&field, &f.epoch_number)) {
+ return "invalid epoch number";
+ }
+ break;
case kFileChecksum:
f.file_checksum = field.ToString();
break;
AppendNumberTo(&r, f.oldest_ancester_time);
r.append(" file_creation_time:");
AppendNumberTo(&r, f.file_creation_time);
+ r.append(" epoch_number:");
+ AppendNumberTo(&r, f.epoch_number);
r.append(" file_checksum:");
r.append(Slice(f.file_checksum).ToString(true));
r.append(" file_checksum_func_name: ");
jw << "LargestIKey" << f.largest.DebugString(hex_key);
jw << "OldestAncesterTime" << f.oldest_ancester_time;
jw << "FileCreationTime" << f.file_creation_time;
+ jw << "EpochNumber" << f.epoch_number;
jw << "FileChecksum" << Slice(f.file_checksum).ToString(true);
jw << "FileChecksumFuncName" << f.file_checksum_func_name;
if (f.temperature != Temperature::kUnknown) {
kMinTimestamp = 10,
kMaxTimestamp = 11,
kUniqueId = 12,
+ kEpochNumber = 13,
// If this bit for the custom tag is set, opening DB should fail if
// we don't know this field.
constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
constexpr uint64_t kUnknownOldestAncesterTime = 0;
constexpr uint64_t kUnknownFileCreationTime = 0;
+constexpr uint64_t kUnknownEpochNumber = 0;  // sentinel: no epoch number set
+// When `Options::allow_ingest_behind` is true, this epoch number is
+// dedicated to files that are ingested behind.
+constexpr uint64_t kReservedEpochNumberForFileIngestedBehind = 1;
extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
// Unix time when the SST file is created.
uint64_t file_creation_time = kUnknownFileCreationTime;
+ // The order in which this file was flushed or ingested/imported.
+ // A compaction output file is assigned the minimum `epoch_number`
+ // among its input files.
+ // For L0, a larger `epoch_number` indicates a newer L0 file.
+ uint64_t epoch_number = kUnknownEpochNumber;
+
// File checksum
std::string file_checksum = kUnknownFileChecksum;
const SequenceNumber& largest_seq, bool marked_for_compact,
Temperature _temperature, uint64_t oldest_blob_file,
uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
- const std::string& _file_checksum,
+ uint64_t _epoch_number, const std::string& _file_checksum,
const std::string& _file_checksum_func_name,
UniqueId64x2 _unique_id)
: fd(file, file_path_id, file_size, smallest_seq, largest_seq),
oldest_blob_file_number(oldest_blob_file),
oldest_ancester_time(_oldest_ancester_time),
file_creation_time(_file_creation_time),
+ epoch_number(_epoch_number),
file_checksum(_file_checksum),
file_checksum_func_name(_file_checksum_func_name),
unique_id(std::move(_unique_id)) {
const SequenceNumber& largest_seqno, bool marked_for_compaction,
Temperature temperature, uint64_t oldest_blob_file_number,
uint64_t oldest_ancester_time, uint64_t file_creation_time,
- const std::string& file_checksum,
+ uint64_t epoch_number, const std::string& file_checksum,
const std::string& file_checksum_func_name,
const UniqueId64x2& unique_id) {
assert(smallest_seqno <= largest_seqno);
FileMetaData(file, file_path_id, file_size, smallest, largest,
smallest_seqno, largest_seqno, marked_for_compaction,
temperature, oldest_blob_file_number, oldest_ancester_time,
- file_creation_time, file_checksum, file_checksum_func_name,
- unique_id));
+ file_creation_time, epoch_number, file_checksum,
+ file_checksum_func_name, unique_id));
if (!HasLastSequence() || largest_seqno > GetLastSequence()) {
SetLastSequence(largest_seqno);
}
#include "db/blob/blob_file_reader.h"
#include "db/blob/blob_source.h"
+#include "db/version_edit.h"
#include "logging/logging.h"
#include "monitoring/persistent_stats_history.h"
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
VersionSet* version_set, bool track_missing_files,
bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer,
- bool skip_load_table_files)
+ bool skip_load_table_files, EpochNumberRequirement epoch_number_requirement)
: VersionEditHandlerBase(),
read_only_(read_only),
column_families_(std::move(column_families)),
no_error_if_files_missing_(no_error_if_files_missing),
io_tracer_(io_tracer),
skip_load_table_files_(skip_load_table_files),
- initialized_(false) {
+ initialized_(false),
+ epoch_number_requirement_(epoch_number_requirement) {
assert(version_set_ != nullptr);
}
}
}
}
+
if (s->ok()) {
for (auto* cfd : *(version_set_->column_family_set_)) {
if (cfd->IsDropped()) {
auto* builder = builder_iter->second->version_builder();
auto* v = new Version(cfd, version_set_, version_set_->file_options_,
*cfd->GetLatestMutableCFOptions(), io_tracer_,
- version_set_->current_version_number_++);
+ version_set_->current_version_number_++,
+ epoch_number_requirement_);
s = builder->SaveTo(v->storage_info());
if (s.ok()) {
// Install new version
VersionEditHandlerPointInTime::VersionEditHandlerPointInTime(
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
- VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer)
+ VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer,
+ EpochNumberRequirement epoch_number_requirement)
: VersionEditHandler(read_only, column_families, version_set,
/*track_missing_files=*/true,
- /*no_error_if_files_missing=*/true, io_tracer) {}
+ /*no_error_if_files_missing=*/true, io_tracer,
+ epoch_number_requirement) {}
VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() {
for (const auto& elem : versions_) {
auto* version = new Version(cfd, version_set_, version_set_->file_options_,
*cfd->GetLatestMutableCFOptions(), io_tracer_,
- version_set_->current_version_number_++);
+ version_set_->current_version_number_++,
+ epoch_number_requirement_);
s = builder->LoadTableHandlers(
cfd->internal_stats(),
version_set_->db_options_->max_file_opening_threads, false, true,
const std::vector<ColumnFamilyDescriptor>& column_families,
VersionSet* version_set, bool track_missing_files,
bool no_error_if_files_missing,
- const std::shared_ptr<IOTracer>& io_tracer)
+ const std::shared_ptr<IOTracer>& io_tracer,
+ EpochNumberRequirement epoch_number_requirement =
+ EpochNumberRequirement::kMustPresent)
: VersionEditHandler(read_only, column_families, version_set,
track_missing_files, no_error_if_files_missing,
- io_tracer, /*skip_load_table_files=*/false) {}
+ io_tracer, /*skip_load_table_files=*/false,
+ epoch_number_requirement) {}
~VersionEditHandler() override {}
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
VersionSet* version_set, bool track_missing_files,
bool no_error_if_files_missing,
- const std::shared_ptr<IOTracer>& io_tracer, bool skip_load_table_files);
+ const std::shared_ptr<IOTracer>& io_tracer, bool skip_load_table_files,
+ EpochNumberRequirement epoch_number_requirement =
+ EpochNumberRequirement::kMustPresent);
Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override;
bool skip_load_table_files_;
bool initialized_;
std::unique_ptr<std::unordered_map<uint32_t, std::string>> cf_to_cmp_names_;
+ EpochNumberRequirement epoch_number_requirement_;
private:
Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
public:
VersionEditHandlerPointInTime(
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
- VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer);
+ VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer,
+ EpochNumberRequirement epoch_number_requirement =
+ EpochNumberRequirement::kMustPresent);
~VersionEditHandlerPointInTime() override;
protected:
public:
explicit ManifestTailer(std::vector<ColumnFamilyDescriptor> column_families,
VersionSet* version_set,
- const std::shared_ptr<IOTracer>& io_tracer)
+ const std::shared_ptr<IOTracer>& io_tracer,
+ EpochNumberRequirement epoch_number_requirement =
+ EpochNumberRequirement::kMustPresent)
: VersionEditHandlerPointInTime(/*read_only=*/false, column_families,
- version_set, io_tracer),
+ version_set, io_tracer,
+ epoch_number_requirement),
mode_(Mode::kRecovery) {}
void PrepareToReadNewManifest() {
InternalKey("foo", kBig + 500 + i, kTypeValue),
InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown,
- kInvalidBlobFileNumber, 888, 678, "234", "crc32c",
+ kInvalidBlobFileNumber, 888, 678,
+ kBig + 300 + i /* epoch_number */, "234", "crc32c",
kNullUniqueId64x2);
edit.DeleteFile(4, kBig + 700 + i);
}
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
- kUnknownFileChecksum, kUnknownFileChecksumFuncName,
- kNullUniqueId64x2);
+ 300 /* epoch_number */, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
- kUnknownFileChecksum, kUnknownFileChecksumFuncName,
- kNullUniqueId64x2);
+ 301 /* epoch_number */, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue),
InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502,
kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber,
- 666, 888, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
- kNullUniqueId64x2);
+ 666, 888, 302 /* epoch_number */, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex),
InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503,
kBig + 603, true, Temperature::kUnknown, 1001,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
- kUnknownFileChecksum, kUnknownFileChecksumFuncName,
- kNullUniqueId64x2);
+ 303 /* epoch_number */, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
edit.DeleteFile(4, 700);
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
- kUnknownFileChecksum, kUnknownFileChecksumFuncName,
- kNullUniqueId64x2);
+ 300 /* epoch_number */, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber,
- 686, 868, "234", "crc32c", kNullUniqueId64x2);
+ 686, 868, 301 /* epoch_number */, "234", "crc32c",
+ kNullUniqueId64x2);
edit.DeleteFile(4, 700);
edit.SetComparatorName("foo");
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
- kUnknownFileChecksum, kUnknownFileChecksumFuncName,
- kNullUniqueId64x2);
+ 300 /* epoch_number */, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
- kUnknownFileChecksum, kUnknownFileChecksumFuncName,
- kNullUniqueId64x2);
+ 1 /*epoch_number*/, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
std::string buffer;
ASSERT_TRUE(!edit.EncodeTo(&buffer));
}
file->stats.num_reads_sampled.load(std::memory_order_relaxed),
file->being_compacted, file->temperature,
file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
- file->TryGetFileCreationTime(), file->file_checksum,
- file->file_checksum_func_name);
+ file->TryGetFileCreationTime(), file->epoch_number,
+ file->file_checksum, file->file_checksum_func_name);
files.back().num_entries = file->num_entries;
files.back().num_deletions = file->num_deletions;
level_size += file->fd.GetFileSize();
const InternalKeyComparator* internal_comparator,
const Comparator* user_comparator, int levels,
CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage,
- bool _force_consistency_checks)
+ bool _force_consistency_checks,
+ EpochNumberRequirement epoch_number_requirement)
: internal_comparator_(internal_comparator),
user_comparator_(user_comparator),
// cfd is nullptr if Version is dummy
current_num_samples_(0),
estimated_compaction_needed_bytes_(0),
finalized_(false),
- force_consistency_checks_(_force_consistency_checks) {
+ force_consistency_checks_(_force_consistency_checks),
+ epoch_number_requirement_(epoch_number_requirement) {
if (ref_vstorage != nullptr) {
accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
const FileOptions& file_opt,
const MutableCFOptions mutable_cf_options,
const std::shared_ptr<IOTracer>& io_tracer,
- uint64_t version_number)
+ uint64_t version_number,
+ EpochNumberRequirement epoch_number_requirement)
: env_(vset->env_),
clock_(vset->clock_),
cfd_(column_family_data),
(cfd_ == nullptr || cfd_->current() == nullptr)
? nullptr
: cfd_->current()->storage_info(),
- cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks),
+ cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks,
+ epoch_number_requirement),
vset_(vset),
next_(this),
prev_(this),
return scratch->buffer;
}
+// Reports whether at least one file, at any level of this version, still
+// carries `kUnknownEpochNumber` as its epoch number.
+bool VersionStorageInfo::HasMissingEpochNumber() const {
+  bool found_unknown = false;
+  for (int lvl = 0; lvl < num_levels_ && !found_unknown; ++lvl) {
+    for (const FileMetaData* meta : files_[lvl]) {
+      if (meta->epoch_number == kUnknownEpochNumber) {
+        found_unknown = true;
+        break;
+      }
+    }
+  }
+  return found_unknown;
+}
+
+// Returns the largest epoch number found on any file of this version, or
+// `kUnknownEpochNumber` when no file exceeds that value (including when the
+// version holds no files at all).
+uint64_t VersionStorageInfo::GetMaxEpochNumberOfFiles() const {
+  uint64_t largest_seen = kUnknownEpochNumber;
+  for (int lvl = 0; lvl < num_levels_; ++lvl) {
+    for (const FileMetaData* meta : files_[lvl]) {
+      if (largest_seen < meta->epoch_number) {
+        largest_seen = meta->epoch_number;
+      }
+    }
+  }
+  return largest_seen;
+}
+
+// Re-establishes `cfd`'s next epoch number from the files of this version
+// and, when any file has no epoch number, assigns epoch numbers to the files.
+// The order of the `cfd->NewEpochNumber()` calls below determines which file
+// ends up with which number, so the statements must not be reordered.
+void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd) {
+ cfd->ResetNextEpochNumber();
+
+ // With `allow_ingest_behind`, one epoch number is drawn up front; the assert
+ // expects it to equal kReservedEpochNumberForFileIngestedBehind.
+ bool reserve_epoch_num_for_file_ingested_behind =
+ cfd->ioptions()->allow_ingest_behind;
+ if (reserve_epoch_num_for_file_ingested_behind) {
+ uint64_t reserved_epoch_number = cfd->NewEpochNumber();
+ assert(reserved_epoch_number == kReservedEpochNumberForFileIngestedBehind);
+ ROCKS_LOG_INFO(cfd->ioptions()->info_log.get(),
+ "[%s]CF has reserved epoch number %" PRIu64
+ " for files ingested "
+ "behind since `Options::allow_ingest_behind` is true",
+ cfd->GetName().c_str(), reserved_epoch_number);
+ }
+
+ if (HasMissingEpochNumber()) {
+ assert(epoch_number_requirement_ == EpochNumberRequirement::kMightMissing);
+ assert(num_levels_ >= 1);
+
+ // Levels >= 1, walked from the last level up to level 1: each non-empty
+ // level draws a single new epoch number that all of its files share.
+ for (int level = num_levels_ - 1; level >= 1; --level) {
+ auto& files_at_level = files_[level];
+ if (files_at_level.empty()) {
+ continue;
+ }
+ uint64_t next_epoch_number = cfd->NewEpochNumber();
+ for (FileMetaData* f : files_at_level) {
+ f->epoch_number = next_epoch_number;
+ }
+ }
+
+ // Level 0, walked from the back of files_[0] to the front: every file
+ // draws its own new epoch number.
+ // NOTE(review): presumably files_[0] is ordered newest-first here, so
+ // older files receive smaller epoch numbers -- confirm.
+ for (auto file_meta_iter = files_[0].rbegin();
+ file_meta_iter != files_[0].rend(); file_meta_iter++) {
+ FileMetaData* f = *file_meta_iter;
+ f->epoch_number = cfd->NewEpochNumber();
+ }
+
+ ROCKS_LOG_WARN(cfd->ioptions()->info_log.get(),
+ "[%s]CF's epoch numbers are inferred based on seqno",
+ cfd->GetName().c_str());
+ // Every file has been assigned an epoch number above.
+ epoch_number_requirement_ = EpochNumberRequirement::kMustPresent;
+ } else {
+ // No file lacks an epoch number: only make sure the next epoch number is
+ // greater than the largest one carried by a file.
+ assert(epoch_number_requirement_ == EpochNumberRequirement::kMustPresent);
+ cfd->SetNextEpochNumber(
+ std::max(GetMaxEpochNumberOfFiles() + 1, cfd->GetNextEpochNumber()));
+ }
+}
+
uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
uint64_t result = 0;
std::vector<FileMetaData*> overlaps;
true /* checksum */, 0 /* log_number */);
VersionEditHandler handler(
read_only, column_families, const_cast<VersionSet*>(this),
- /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_);
+ /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_,
+ EpochNumberRequirement::kMightMissing);
handler.Iterate(reader, &log_read_status);
s = handler.status();
if (s.ok()) {
assert(current_manifest_file_size != 0);
handler.GetDbId(db_id);
}
+ if (s.ok()) {
+ RecoverEpochNumbers();
+ }
}
if (s.ok()) {
log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
/*checksum=*/true, /*log_num=*/0);
VersionEditHandlerPointInTime handler_pit(
- read_only, column_families, const_cast<VersionSet*>(this), io_tracer_);
+ read_only, column_families, const_cast<VersionSet*>(this), io_tracer_,
+ EpochNumberRequirement::kMightMissing);
handler_pit.Iterate(reader, &s);
assert(nullptr != has_missing_table_file);
*has_missing_table_file = handler_pit.HasMissingFiles();
- return handler_pit.status();
+ s = handler_pit.status();
+ if (s.ok()) {
+ RecoverEpochNumbers();
+ }
+ return s;
+}
+
+// Runs epoch number recovery on every column family in
+// `column_family_set_` that has not been dropped.
+void VersionSet::RecoverEpochNumbers() {
+  for (auto cfd : *column_family_set_) {
+    if (!cfd->IsDropped()) {
+      assert(cfd->initialized());
+      cfd->RecoverEpochNumbers();
+    }
+  }
}
Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
f->fd.smallest_seqno, f->fd.largest_seqno,
f->marked_for_compaction, f->temperature,
f->oldest_blob_file_number, f->oldest_ancester_time,
- f->file_creation_time, f->file_checksum,
+ f->file_creation_time, f->epoch_number, f->file_checksum,
f->file_checksum_func_name, f->unique_id);
}
}
filemetadata.temperature = file->temperature;
filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime();
filemetadata.file_creation_time = file->TryGetFileCreationTime();
+ filemetadata.epoch_number = file->epoch_number;
metadata->push_back(filemetadata);
}
}
log::Reader* reader = manifest_reader->get();
assert(reader);
- manifest_tailer_.reset(new ManifestTailer(
- column_families, const_cast<ReactiveVersionSet*>(this), io_tracer_));
+ manifest_tailer_.reset(
+ new ManifestTailer(column_families, const_cast<ReactiveVersionSet*>(this),
+ io_tracer_, EpochNumberRequirement::kMightMissing));
manifest_tailer_->Iterate(*reader, manifest_reader_status->get());
- return manifest_tailer_->status();
+ s = manifest_tailer_->status();
+ if (s.ok()) {
+ RecoverEpochNumbers();
+ }
+ return s;
}
Status ReactiveVersionSet::ReadAndApply(
extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
const std::vector<FileMetaData*>& files,
Arena* arena);
+// Expectation about whether files carry an epoch number.
+enum EpochNumberRequirement {
+ // Files may still have `kUnknownEpochNumber`; the manifest recovery paths
+ // pass this value.
+ kMightMissing,
+ // Every file is expected to have an epoch number; this is the default for
+ // the constructors that take an EpochNumberRequirement.
+ kMustPresent,
+};
// Information of the storage associated with each Version, including number of
// levels of LSM tree, files information at each level, files marked for
const Comparator* user_comparator, int num_levels,
CompactionStyle compaction_style,
VersionStorageInfo* src_vstorage,
- bool _force_consistency_checks);
+ bool _force_consistency_checks,
+ EpochNumberRequirement epoch_number_requirement =
+ EpochNumberRequirement::kMustPresent);
// No copying allowed
VersionStorageInfo(const VersionStorageInfo&) = delete;
void operator=(const VersionStorageInfo&) = delete;
return files_[level];
}
+ bool HasMissingEpochNumber() const;
+ uint64_t GetMaxEpochNumberOfFiles() const;
+ // Returns the epoch number requirement currently recorded for this
+ // storage info.
+ EpochNumberRequirement GetEpochNumberRequirement() const {
+ return epoch_number_requirement_;
+ }
+ // Overwrites the recorded epoch number requirement; performs no checks
+ // against the files currently held.
+ void SetEpochNumberRequirement(
+ EpochNumberRequirement epoch_number_requirement) {
+ epoch_number_requirement_ = epoch_number_requirement;
+ }
+ void RecoverEpochNumbers(ColumnFamilyData* cfd);
+
class FileLocation {
public:
FileLocation() = default;
return files_marked_for_compaction_;
}
+ // Test helper (per the TEST_ prefix): sets `f->marked_for_compaction` and
+ // appends the (level, f) pair to `files_marked_for_compaction_`.
+ void TEST_AddFileMarkedForCompaction(int level, FileMetaData* f) {
+ f->marked_for_compaction = true;
+ files_marked_for_compaction_.emplace_back(level, f);
+ }
+
// REQUIRES: ComputeCompactionScore has been called
// REQUIRES: DB mutex held during access
const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const {
// is compiled in release mode
bool force_consistency_checks_;
+ EpochNumberRequirement epoch_number_requirement_;
+
friend class Version;
friend class VersionSet;
};
Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt,
MutableCFOptions mutable_cf_options,
const std::shared_ptr<IOTracer>& io_tracer,
- uint64_t version_number = 0);
+ uint64_t version_number = 0,
+ EpochNumberRequirement epoch_number_requirement =
+ EpochNumberRequirement::kMustPresent);
~Version();
const std::vector<ColumnFamilyDescriptor>& column_families,
bool read_only, std::string* db_id, bool* has_missing_table_file);
+ // Recover the next epoch number of each CFs and epoch number
+ // of their files (if missing)
+ void RecoverEpochNumbers();
+
// Reads a manifest file and returns a list of column families in
// column_families.
static Status ListColumnFamilies(std::vector<std::string>* column_families,
#include "db/db_impl/db_impl.h"
#include "db/db_test_util.h"
#include "db/log_writer.h"
+#include "db/version_edit.h"
#include "rocksdb/advanced_options.h"
#include "rocksdb/convenience.h"
#include "rocksdb/file_system.h"
InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
largest_seq, /* marked_for_compact */ false, Temperature::kUnknown,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
- kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kNullUniqueId64x2);
files_.push_back(f);
}
/* largest_seq */ 0, /* marked_for_compact */ false,
Temperature::kUnknown, oldest_blob_file_number,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
- kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
f->compensated_file_size = file_size;
vstorage_.AddFile(level, f);
}
std::string column_family;
std::string key; // the only key
int level = 0;
+ uint64_t epoch_number;
+ // Convenience constructor: level 0, optional epoch number.
+ // NOTE: an `int` fourth argument (e.g. the literal `100`) is an exact match
+ // for the `int lvl` overload below and is therefore taken as the LEVEL, not
+ // as the epoch number. To set the epoch number, pass the level explicitly
+ // or pass a `uint64_t` value.
SstInfo(uint64_t file_num, const std::string& cf_name,
- const std::string& _key)
- : SstInfo(file_num, cf_name, _key, 0) {}
+ const std::string& _key,
+ uint64_t _epoch_number = kUnknownEpochNumber)
+ : SstInfo(file_num, cf_name, _key, 0, _epoch_number) {}
+ // Full constructor: stores the file number, column family name, key, level
+ // and epoch number (`kUnknownEpochNumber` when omitted).
SstInfo(uint64_t file_num, const std::string& cf_name,
- const std::string& _key, int lvl)
+ const std::string& _key, int lvl,
+ uint64_t _epoch_number = kUnknownEpochNumber)
: file_number(file_num),
column_family(cf_name),
key(_key),
- level(lvl) {}
+ level(lvl),
+ epoch_number(_epoch_number) {}
};
// Create dummy sst, return their metadata. Note that only file name and size
ASSERT_NE(0, file_size);
file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey,
ikey, 0, 0, false, Temperature::kUnknown, 0, 0,
- 0, kUnknownFileChecksum,
+ 0, info.epoch_number, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kNullUniqueId64x2);
}
}
TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) {
std::vector<SstInfo> existing_files = {
- SstInfo(100, kDefaultColumnFamilyName, "a"),
- SstInfo(102, kDefaultColumnFamilyName, "b"),
- SstInfo(103, kDefaultColumnFamilyName, "c"),
- SstInfo(107, kDefaultColumnFamilyName, "d"),
- SstInfo(110, kDefaultColumnFamilyName, "e")};
+ // The level is passed explicitly: with only four arguments an `int`
+ // literal binds to SstInfo's `int lvl` parameter, which would set the
+ // level and leave the epoch number unknown.
+ SstInfo(100, kDefaultColumnFamilyName, "a", 0 /* level */,
+ 100 /* epoch_number */),
+ SstInfo(102, kDefaultColumnFamilyName, "b", 0 /* level */,
+ 102 /* epoch_number */),
+ SstInfo(103, kDefaultColumnFamilyName, "c", 0 /* level */,
+ 103 /* epoch_number */),
+ SstInfo(107, kDefaultColumnFamilyName, "d", 0 /* level */,
+ 107 /* epoch_number */),
+ SstInfo(110, kDefaultColumnFamilyName, "e", 0 /* level */,
+ 110 /* epoch_number */)};
std::vector<FileMetaData> file_metas;
CreateDummyTableFiles(existing_files, &file_metas);
std::string largest_ukey = "b";
InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue);
InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue);
+
FileMetaData meta = FileMetaData(
file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
- kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ file_num /* epoch_number */, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
added_files.emplace_back(0, meta);
}
WriteFileAdditionAndDeletionToManifest(
TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) {
std::vector<SstInfo> existing_files = {
- SstInfo(100, kDefaultColumnFamilyName, "a"),
- SstInfo(102, kDefaultColumnFamilyName, "b"),
- SstInfo(103, kDefaultColumnFamilyName, "c"),
- SstInfo(107, kDefaultColumnFamilyName, "d"),
- SstInfo(110, kDefaultColumnFamilyName, "e")};
+ SstInfo(100, kDefaultColumnFamilyName, "a", 0 /* level */,
+ 100 /* epoch_number */),
+ SstInfo(102, kDefaultColumnFamilyName, "b", 0 /* level */,
+ 102 /* epoch_number */),
+ SstInfo(103, kDefaultColumnFamilyName, "c", 0 /* level */,
+ 103 /* epoch_number */),
+ SstInfo(107, kDefaultColumnFamilyName, "d", 0 /* level */,
+ 107 /* epoch_number */),
+ SstInfo(110, kDefaultColumnFamilyName, "e", 0 /* level */,
+ 110 /* epoch_number */)};
std::vector<FileMetaData> file_metas;
CreateDummyTableFiles(existing_files, &file_metas);
FileMetaData meta = FileMetaData(
file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
- kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ file_num /* epoch_number */, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
added_files.emplace_back(0, meta);
}
WriteFileAdditionAndDeletionToManifest(
TEST_F(VersionSetTestMissingFiles, NoFileMissing) {
std::vector<SstInfo> existing_files = {
- SstInfo(100, kDefaultColumnFamilyName, "a"),
- SstInfo(102, kDefaultColumnFamilyName, "b"),
- SstInfo(103, kDefaultColumnFamilyName, "c"),
- SstInfo(107, kDefaultColumnFamilyName, "d"),
- SstInfo(110, kDefaultColumnFamilyName, "e")};
+ SstInfo(100, kDefaultColumnFamilyName, "a", 0 /* level */,
+ 100 /* epoch_number */),
+ SstInfo(102, kDefaultColumnFamilyName, "b", 0 /* level */,
+ 102 /* epoch_number */),
+ SstInfo(103, kDefaultColumnFamilyName, "c", 0 /* level */,
+ 103 /* epoch_number */),
+ SstInfo(107, kDefaultColumnFamilyName, "d", 0 /* level */,
+ 107 /* epoch_number */),
+ SstInfo(110, kDefaultColumnFamilyName, "e", 0 /* level */,
+ 110 /* epoch_number */)};
std::vector<FileMetaData> file_metas;
CreateDummyTableFiles(existing_files, &file_metas);
db_options_.allow_2pc = true;
NewDB();
- SstInfo sst(100, kDefaultColumnFamilyName, "a");
+ SstInfo sst(100, kDefaultColumnFamilyName, "a", 0 /* level */,
+ 100 /* epoch_number */);
std::vector<FileMetaData> file_metas;
CreateDummyTableFiles({sst}, &file_metas);
bool _being_compacted, Temperature _temperature,
uint64_t _oldest_blob_file_number,
uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
- std::string& _file_checksum,
+ uint64_t _epoch_number, std::string& _file_checksum,
std::string& _file_checksum_func_name)
: smallest_seqno(_smallest_seqno),
largest_seqno(_largest_seqno),
num_deletions(0),
oldest_blob_file_number(_oldest_blob_file_number),
oldest_ancester_time(_oldest_ancester_time),
- file_creation_time(_file_creation_time) {
+ file_creation_time(_file_creation_time),
+ epoch_number(_epoch_number) {
if (!_file_name.empty()) {
if (_file_name[0] == '/') {
relative_filename = _file_name.substr(1);
// Timestamp when the SST file is created, provided by
// SystemClock::GetCurrentTime(). 0 if the information is not available.
uint64_t file_creation_time = 0;
-
+ // The order of a file being flushed or ingested/imported.
+ // Compaction output file will be assigned with the minimum `epoch_number`
+ // among input files'.
+ // For L0, larger `epoch_number` indicates newer L0 file.
+ // 0 if the information is not available.
+ uint64_t epoch_number = 0;
// DEPRECATED: The name of the file within its directory with a
// leading slash (e.g. "/123456.sst"). Use relative_filename from base struct
// instead.
live_file_metadata.largestkey = std::move(file_metadata.largestkey);
live_file_metadata.oldest_blob_file_number =
file_metadata.oldest_blob_file_number;
+ live_file_metadata.epoch_number = file_metadata.epoch_number;
live_file_metadata.level = level_metadata.level;
result_metadata->files.push_back(live_file_metadata);
}