extra_compiler_flags=[])
+cpp_unittest_wrapper(name="db_etc3_test",
+ srcs=["db/db_etc3_test.cc"],
+ deps=[":rocksdb_test_lib"],
+ extra_compiler_flags=[])
+
+
cpp_unittest_wrapper(name="db_flush_test",
srcs=["db/db_flush_test.cc"],
deps=[":rocksdb_test_lib"],
db/db_clip_test.cc
db/db_dynamic_level_test.cc
db/db_encryption_test.cc
+ db/db_etc3_test.cc
db/db_flush_test.cc
db/db_inplace_update_test.cc
db/db_io_failure_test.cc
db_test2: $(OBJ_DIR)/db/db_test2.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
+db_etc3_test: $(OBJ_DIR)/db/db_etc3_test.o $(TEST_LIBRARY) $(LIBRARY)
+ $(AM_LINK)
+
compression_test: $(OBJ_DIR)/util/compression_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
table_cache_(NewLRUCache(50000, 16)),
write_buffer_manager_(db_options_.db_write_buffer_size),
versions_(new VersionSet(
- dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_,
+ dbname_, &db_options_, mutable_db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr,
/*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"",
/*daily_offpeak_time_utc=*/"",
ASSERT_OK(s);
db_options_.info_log = info_log;
- versions_.reset(
- new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_,
- /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
- test::kUnitTestDbId, /*db_session_id=*/"",
- /*daily_offpeak_time_utc=*/"",
- /*error_handler=*/nullptr, /*unchanging=*/false));
+ versions_.reset(new VersionSet(
+ dbname_, &db_options_, mutable_db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ test::kUnitTestDbId, /*db_session_id=*/"",
+ /*daily_offpeak_time_utc=*/"",
+ /*error_handler=*/nullptr, /*unchanging=*/false));
compaction_job_stats_.Reset();
VersionEdit new_db;
} while (ChangeCompactOptions());
}
-TEST_F(DBBasicTest, ManifestRollOver) {
- do {
- Options options;
- options.max_manifest_file_size = 10; // 10 bytes
- options = CurrentOptions(options);
- CreateAndReopenWithCF({"pikachu"}, options);
- {
- ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
- ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
- ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3')));
- uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
- ASSERT_OK(Flush(1)); // This should trigger LogAndApply.
- uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
- ASSERT_GT(manifest_after_flush, manifest_before_flush);
- ReopenWithColumnFamilies({"default", "pikachu"}, options);
- ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
- // check if a new manifest file got inserted or not.
- ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
- ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
- ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
- }
- } while (ChangeCompactOptions());
-}
-
TEST_F(DBBasicTest, IdentityAcrossRestarts) {
constexpr size_t kMinIdSize = 10;
do {
--- /dev/null
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Test fixture for the tests in this file. Derives from DBTestBase, passing
+// "db_etc3_test" as the test name/path argument and requesting
+// env_do_fsync=true.
+class DBEtc3Test : public DBTestBase {
+ public:
+ DBEtc3Test() : DBTestBase("db_etc3_test", /*env_do_fsync=*/true) {}
+};
+
+// Verifies that the manifest file is replaced by a newer-numbered one both
+// when a flush writes to it and when the DB is reopened, and that the data
+// written beforehand is still readable afterwards.
+TEST_F(DBEtc3Test, ManifestRollOver) {
+  const std::string value1(1000, '1');
+  const std::string value2(1000, '2');
+  const std::string value3(1000, '3');
+  do {
+    Options options;
+    // Zero size limit and no space-amp allowance: every manifest write
+    // should start a new manifest file.
+    options.max_manifest_file_size = 0;
+    options.max_manifest_space_amp_pct = 0;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    ASSERT_OK(Put(1, "key1", value1));
+    ASSERT_OK(Put(1, "key2", value2));
+    ASSERT_OK(Put(1, "key3", value3));
+
+    const uint64_t manifest_num_before =
+        dbfull()->TEST_Current_Manifest_FileNo();
+    // The flush goes through LogAndApply, i.e. a manifest write.
+    ASSERT_OK(Flush(1));
+    const uint64_t manifest_num_after =
+        dbfull()->TEST_Current_Manifest_FileNo();
+    ASSERT_GT(manifest_num_after, manifest_num_before);
+
+    // Re-opening the DB is expected to always create a fresh manifest file.
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_num_after);
+
+    // Data must have survived the manifest switches.
+    ASSERT_EQ(value1, Get(1, "key1"));
+    ASSERT_EQ(value2, Get(1, "key2"));
+    ASSERT_EQ(value3, Get(1, "key3"));
+  } while (ChangeCompactOptions());
+}
+
+// Exercises manifest rotation under max_manifest_file_size and
+// max_manifest_space_amp_pct, including changing both dynamically through
+// SetDBOptions(). The manifest file number is sampled after every operation
+// that writes to the manifest; a change in that number means the manifest
+// was rewritten ("compacted") into a new file.
+TEST_F(DBEtc3Test, AutoTuneManifestSize) {
+  // Ensure we have auto-tuning beyond max_manifest_file_size by default
+  ASSERT_EQ(DBOptions{}.max_manifest_space_amp_pct, 500);
+
+  Options options = CurrentOptions();
+  // Keep automatic compactions from being triggered by the handful of
+  // flushes below, so that the only manifest writes are the ones this test
+  // issues explicitly. This is set on `options` (rather than through
+  // SetOptions() on the DB that is open right now) because the DB is
+  // destroyed and reopened with `options` before the test body runs.
+  options.level0_file_num_compaction_trigger = 20;
+
+  // Use large column family names to essentially control the amount of payload
+  // data needed for the manifest file. Drop manifest entries don't include the
+  // CF name so are small.
+  uint64_t prev_manifest_num = 0, cur_manifest_num = 0;
+  std::deque<ColumnFamilyHandle*> handles;
+  int counter = 5;
+  // Shifts the last observed manifest number into `prev_manifest_num` and
+  // samples the current one into `cur_manifest_num`.
+  auto RecordManifestNum = [&]() {
+    prev_manifest_num = cur_manifest_num;
+    cur_manifest_num = dbfull()->TEST_Current_Manifest_FileNo();
+  };
+  // Creates a column family with a 1000-byte name and keeps its handle at
+  // the back of `handles`.
+  auto AddCfFn = [&]() {
+    std::string name = "cf" + std::to_string(counter++);
+    name.resize(1000, 'a');
+    ASSERT_OK(db_->CreateColumnFamily(options, name, &handles.emplace_back()));
+    RecordManifestNum();
+  };
+  // Drops the oldest column family still tracked in `handles`.
+  auto DropCfFn = [&]() {
+    ASSERT_OK(db_->DropColumnFamily(handles.front()));
+    ASSERT_OK(db_->DestroyColumnFamilyHandle(handles.front()));
+    handles.pop_front();
+    RecordManifestNum();
+  };
+  // Causes a small manifest write by flushing a single key.
+  auto TrivialManifestWriteFn = [&]() {
+    ASSERT_OK(Put("x", std::to_string(counter++)));
+    ASSERT_OK(Flush());
+    RecordManifestNum();
+  };
+
+  options.max_manifest_file_size = 1000000;
+  options.max_manifest_space_amp_pct = 0;  // no auto-tuning yet
+  DestroyAndReopen(options);
+
+  // With the generous (minimum) maximum manifest size, should not be rotated
+  AddCfFn();
+  AddCfFn();
+  AddCfFn();
+  ASSERT_EQ(prev_manifest_num, cur_manifest_num);
+
+  // Change options for small max and (still) no auto-tuning
+  ASSERT_OK(db_->SetDBOptions({{"max_manifest_file_size", "3000"}}));
+
+  // Takes effect on the next manifest write
+  TrivialManifestWriteFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+
+  // Now we have to rewrite the whole manifest on each write because the
+  // compacted size exceeds the "max" size.
+  AddCfFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+  DropCfFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+  AddCfFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+  TrivialManifestWriteFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+
+  // Enabling auto-tuning should fix this, immediately for next manifest writes.
+  // This will allow up to double-ish the size of the compacted manifest,
+  // which last should have been 4000 + some bytes.
+  ASSERT_EQ(handles.size(), 4U);
+  ASSERT_OK(db_->SetDBOptions({{"max_manifest_space_amp_pct", "105"}}));
+
+  // After 9 CF names should be enough to rotate the manifest
+  for (int i = 1; i <= 5; ++i) {
+    if ((i % 2) == 1) {
+      DropCfFn();
+    }
+    AddCfFn();
+    ASSERT_EQ(prev_manifest_num, cur_manifest_num);
+  }
+  TrivialManifestWriteFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+
+  // We now have a different last compacted manifest size, should be
+  // able to go beyond 9 CFs named in manifest this time.
+  ASSERT_EQ(handles.size(), 6U);
+
+  DropCfFn();
+  DropCfFn();
+  for (int i = 1; i <= 4; ++i) {
+    DropCfFn();
+    AddCfFn();
+    ASSERT_EQ(prev_manifest_num, cur_manifest_num);
+  }
+  // We've written 10 named CFs to the manifest. We should be able to
+  // dynamically change the auto-tuning still based on the last "compacted"
+  // manifest size of 7000 + some bytes.
+  ASSERT_OK(db_->SetDBOptions({{"max_manifest_space_amp_pct", "51"}}));
+  TrivialManifestWriteFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+  // And the "compacted" manifest size has reset again, so should be changed
+  // again sooner.
+  ASSERT_EQ(handles.size(), 4U);
+  for (int i = 1; i <= 2; ++i) {
+    AddCfFn();
+    ASSERT_EQ(prev_manifest_num, cur_manifest_num);
+  }
+  // Enough for manifest change
+  AddCfFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+
+  // Wrap up
+  while (!handles.empty()) {
+    DropCfFn();
+  }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+// Test binary entry point: installs the stack trace handler, initializes
+// GoogleTest from the command line, registers custom objects, then runs
+// every registered test and returns the result as the exit code.
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}
[this]() { this->TriggerPeriodicCompaction(); });
versions_.reset(new VersionSet(
- dbname_, &immutable_db_options_, file_options_, table_cache_.get(),
- write_buffer_manager_, &write_controller_, &block_cache_tracer_,
- io_tracer_, db_id_, db_session_id_, options.daily_offpeak_time_utc,
- &error_handler_, read_only));
+ dbname_, &immutable_db_options_, mutable_db_options_, file_options_,
+ table_cache_.get(), write_buffer_manager_, &write_controller_,
+ &block_cache_tracer_, io_tracer_, db_id_, db_session_id_,
+ options.daily_offpeak_time_utc, &error_handler_, read_only));
column_family_memtables_.reset(
new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
file_options_for_compaction_ = FileOptions(new_db_options);
file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite(
file_options_for_compaction_, immutable_db_options_);
- versions_->ChangeFileOptions(mutable_db_options_);
+ versions_->UpdatedMutableDbOptions(mutable_db_options_, &mutex_);
// TODO(xiez): clarify why apply optimize for read to write options
file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead(
file_options_for_compaction_, immutable_db_options_);
DBImplFollower* impl =
new DBImplFollower(tmp_opts, std::move(new_env), dbname, src_path);
impl->versions_.reset(new ReactiveVersionSet(
- dbname, &impl->immutable_db_options_, impl->file_options_,
- impl->table_cache_.get(), impl->write_buffer_manager_,
- &impl->write_controller_, impl->io_tracer_));
+ dbname, &impl->immutable_db_options_, impl->mutable_db_options_,
+ impl->file_options_, impl->table_cache_.get(),
+ impl->write_buffer_manager_, &impl->write_controller_, impl->io_tracer_));
impl->column_family_memtables_.reset(
new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
}
FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
file->SetPreallocationBlockSize(
- immutable_db_options_.manifest_preallocation_size);
+ mutable_db_options_.manifest_preallocation_size);
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
std::move(file), manifest, file_options, immutable_db_options_.clock,
io_tracer_, nullptr /* stats */,
handles->clear();
DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path);
impl->versions_.reset(new ReactiveVersionSet(
- dbname, &impl->immutable_db_options_, impl->file_options_,
- impl->table_cache_.get(), impl->write_buffer_manager_,
- &impl->write_controller_, impl->io_tracer_));
+ dbname, &impl->immutable_db_options_, impl->mutable_db_options_,
+ impl->file_options_, impl->table_cache_.get(),
+ impl->write_buffer_manager_, &impl->write_controller_, impl->io_tracer_));
impl->column_family_memtables_.reset(
new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
ASSERT_GT(low_bytes_per_sync, counter);
}
+// Checks that each manifest-related mutable DB option, when changed through
+// SetDBOptions(), is reflected both in GetDBOptions() and in the copy saved
+// inside the VersionSet. Not an end-to-end test of the options' effects.
+TEST_F(DBOptionsTest, MutableManifestOptions) {
+  for (int64_t value : {0, 1, 100, 100000, 10000000}) {
+    // max_manifest_file_size
+    const std::string file_size_text = std::to_string(value);
+    ASSERT_OK(db_->SetDBOptions({{"max_manifest_file_size", file_size_text}}));
+    ASSERT_EQ(value,
+              static_cast<int64_t>(db_->GetDBOptions().max_manifest_file_size));
+    ASSERT_EQ(value,
+              static_cast<int64_t>(
+                  dbfull()->GetVersionSet()->TEST_GetMinMaxManifestFileSize()));
+
+    // Use a slightly different value for the next option (except for 0 and
+    // 1) so the settings cannot be confused with one another.
+    if (value > 1) {
+      value += 1;
+    }
+
+    // max_manifest_space_amp_pct
+    const std::string amp_pct_text = std::to_string(value);
+    ASSERT_OK(
+        db_->SetDBOptions({{"max_manifest_space_amp_pct", amp_pct_text}}));
+    ASSERT_EQ(value, static_cast<int64_t>(
+                         db_->GetDBOptions().max_manifest_space_amp_pct));
+    ASSERT_EQ(value,
+              static_cast<int64_t>(
+                  dbfull()->GetVersionSet()->TEST_GetMaxManifestSpaceAmpPct()));
+
+    if (value > 1) {
+      value += 1;
+    }
+
+    // manifest_preallocation_size
+    const std::string prealloc_text = std::to_string(value);
+    ASSERT_OK(
+        db_->SetDBOptions({{"manifest_preallocation_size", prealloc_text}}));
+    ASSERT_EQ(value, static_cast<int64_t>(
+                         db_->GetDBOptions().manifest_preallocation_size));
+    ASSERT_EQ(
+        value,
+        static_cast<int64_t>(
+            dbfull()->GetVersionSet()->TEST_GetManifestPreallocationSize()));
+  }
+}
+
+
TEST_F(DBOptionsTest, WritableFileMaxBufferSize) {
Options options;
options.create_if_missing = true;
options.writable_file_max_buffer_size = 1024 * 1024;
options.level0_file_num_compaction_trigger = 3;
options.max_manifest_file_size = 1;
+ options.max_manifest_space_amp_pct = 0;
options.env = env_;
int buffer_size = 1024 * 1024;
Reopen(options);
Options options = CurrentOptions();
DestroyAndReopen(options);
options.max_manifest_file_size = 10;
+ options.max_manifest_space_amp_pct = 0;
options.create_if_missing = true;
CreateAndReopenWithCF({"pikachu"}, options);
ASSERT_EQ(2, handles_.size());
Destroy(last_options_);
Options options = GetDefaultOptions();
options.max_manifest_file_size = 1;
+ options.max_manifest_space_amp_pct = 0;
options.create_if_missing = true;
Reopen(options);
ASSERT_OK(Put("key", "value"));
Destroy(last_options_);
Options options = GetDefaultOptions();
options.max_manifest_file_size = 1;
+ options.max_manifest_space_amp_pct = 0;
options.create_if_missing = true;
Reopen(options);
ASSERT_OK(Put("a", "a_value"));
options.allow_mmap_reads = can_allow_mmap;
break;
case kManifestFileSize:
- options.max_manifest_file_size = 50; // 50 bytes
+ options.max_manifest_file_size = 50; // 50 bytes
+ options.max_manifest_space_amp_pct = 0; // old behavior
break;
case kPerfOptions:
options.delayed_write_rate = 8 * 1024 * 1024;
WriteController write_controller;
versions.reset(new VersionSet(
- test->dbname_, &db_options, file_options, table_cache.get(),
- &write_buffer_manager, &write_controller,
+ test->dbname_, &db_options, MutableDBOptions{options}, file_options,
+ table_cache.get(), &write_buffer_manager, &write_controller,
/*block_cache_tracer=*/nullptr,
/*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"",
options.daily_offpeak_time_utc,
Options options = CurrentOptions();
// Small size to force manifest creation
options.max_manifest_file_size = 1;
+ options.max_manifest_space_amp_pct = 0;
options.track_and_verify_wals_in_manifest = true;
DestroyAndReopen(options);
column_families.emplace_back(cf_name, cf_options_);
}
- versions_.reset(
- new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_,
- /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
- test::kUnitTestDbId, /*db_session_id=*/"",
- /*daily_offpeak_time_utc=*/"",
- /*error_handler=*/nullptr, /*read_only=*/false));
+ versions_.reset(new VersionSet(
+ dbname_, &db_options_, MutableDBOptions{options_}, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ test::kUnitTestDbId, /*db_session_id=*/"",
+ /*daily_offpeak_time_utc=*/"",
+ /*error_handler=*/nullptr, /*read_only=*/false));
EXPECT_OK(versions_->Recover(column_families, false));
}
WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
WriteController write_controller(10000000u);
- VersionSet versions(dbname, &immutable_db_options, env_options,
+ VersionSet versions(dbname, &immutable_db_options,
+ MutableDBOptions{db_options}, env_options,
table_cache.get(), &write_buffer_manager,
&write_controller, /*block_cache_tracer=*/nullptr,
/*io_tracer=*/nullptr, /*db_id=*/"",
WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
WriteController write_controller(10000000u);
- VersionSet versions(dbname, &immutable_db_options, env_options,
+ VersionSet versions(dbname, &immutable_db_options,
+ MutableDBOptions{db_options}, env_options,
table_cache.get(), &write_buffer_manager,
&write_controller, /*block_cache_tracer=*/nullptr,
/*io_tracer=*/nullptr, /*db_id=*/"",
/*io_tracer=*/nullptr, db_session_id_)),
wb_(db_options_.db_write_buffer_size),
wc_(db_options_.delayed_write_rate),
- vset_(dbname_, &immutable_db_options_, file_options_,
- raw_table_cache_.get(), &wb_, &wc_,
+ vset_(dbname_, &immutable_db_options_, MutableDBOptions{db_options_},
+ file_options_, raw_table_cache_.get(), &wb_, &wc_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", db_session_id_, db_options.daily_offpeak_time_utc,
/*error_handler=*/nullptr, /*read_only=*/false),
VersionSet::VersionSet(
const std::string& dbname, const ImmutableDBOptions* _db_options,
+ const MutableDBOptions& mutable_db_options,
const FileOptions& storage_options, Cache* table_cache,
WriteBufferManager* write_buffer_manager, WriteController* write_controller,
BlockCacheTracer* const block_cache_tracer,
prev_log_number_(0),
current_version_number_(0),
manifest_file_size_(0),
+ last_compacted_manifest_file_size_(0),
file_options_(storage_options),
block_cache_tracer_(block_cache_tracer),
io_tracer_(io_tracer),
offpeak_time_option_(OffpeakTimeOption(daily_offpeak_time_utc)),
error_handler_(error_handler),
unchanging_(unchanging),
- closed_(false) {}
+ closed_(false) {
+ UpdatedMutableDbOptions(mutable_db_options, /*mu=*/nullptr);
+}
Status VersionSet::Close(FSDirectory* db_dir, InstrumentedMutex* mu) {
Status s;
current_version_number_ = 0;
manifest_writers_.clear();
manifest_file_size_ = 0;
+ last_compacted_manifest_file_size_ = 0;
+ TuneMaxManifestFileSize();
obsolete_files_.clear();
obsolete_manifests_.clear();
wals_.Reset();
}
+// Copies the manifest- and file-related settings from `updated_options`
+// into this VersionSet and re-derives the tuned manifest size limit.
+// `mu` may be nullptr only while manifest_file_size_ is still zero (i.e.
+// during initialization); otherwise it is dereferenced to assert that the
+// caller holds it.
+void VersionSet::UpdatedMutableDbOptions(
+    const MutableDBOptions& updated_options, InstrumentedMutex* mu) {
+  const bool manifest_in_use = manifest_file_size_ > 0;
+  if (manifest_in_use) {
+    // Must be holding mutex if not called during initialization
+    mu->AssertHeld();
+  }
+
+  // Negative percentages are clamped to zero before being stored in the
+  // unsigned member.
+  const auto clamped_amp_pct =
+      std::max(updated_options.max_manifest_space_amp_pct, 0);
+
+  file_options_.writable_file_max_buffer_size =
+      updated_options.writable_file_max_buffer_size;
+  min_max_manifest_file_size_ = updated_options.max_manifest_file_size;
+  max_manifest_space_amp_pct_ = static_cast<unsigned>(clamped_amp_pct);
+  manifest_preallocation_size_ = updated_options.manifest_preallocation_size;
+
+  // The limit depends on the values just stored, so refresh it.
+  TuneMaxManifestFileSize();
+}
+
+
+// Recomputes tuned_max_manifest_file_size_ as the larger of
+//  * min_max_manifest_file_size_, and
+//  * last_compacted_manifest_file_size_ scaled to
+//    (100 + max_manifest_space_amp_pct_) percent.
+// NOTE(review): the scaling multiplies before dividing in 64-bit unsigned
+// arithmetic; this presumably never wraps for realistic manifest sizes --
+// confirm if extreme percentage values are expected.
+void VersionSet::TuneMaxManifestFileSize() {
+  const unsigned allowed_pct = 100U + max_manifest_space_amp_pct_;
+  const uint64_t amp_based_limit =
+      last_compacted_manifest_file_size_ * allowed_pct / 100U;
+  tuned_max_manifest_file_size_ =
+      std::max(min_max_manifest_file_size_, amp_based_limit);
+}
+
void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
Version* v) {
// compute new compaction score
}
#endif // NDEBUG
+ uint64_t prev_manifest_file_size = manifest_file_size_;
assert(pending_manifest_file_number_ == 0);
if (!skip_manifest_write &&
(!descriptor_log_ ||
- manifest_file_size_ > db_options_->max_manifest_file_size)) {
+ prev_manifest_file_size >= tuned_max_manifest_file_size_)) {
TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
new_descriptor_log = true;
} else {
IOStatus manifest_io_status;
manifest_io_status.PermitUncheckedError();
std::unique_ptr<log::Writer> new_desc_log_ptr;
+ // Save before releasing mu
+ uint64_t manifest_preallocation_size = manifest_preallocation_size_;
if (skip_manifest_write) {
if (s.ok()) {
constexpr bool update_stats = true;
// This is fine because everything inside of this block is serialized --
// only one thread can be here at the same time
// create new manifest file
- ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
- pending_manifest_file_number_);
std::string descriptor_fname =
DescriptorFileName(dbname_, pending_manifest_file_number_);
std::unique_ptr<FSWritableFile> descriptor_file;
io_s = NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file,
opt_file_opts);
if (io_s.ok()) {
- descriptor_file->SetPreallocationBlockSize(
- db_options_->manifest_preallocation_size);
+ descriptor_file->SetPreallocationBlockSize(manifest_preallocation_size);
FileTypeSet tmp_set = db_options_->checksum_handoff_file_types;
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_,
if (s.ok()) {
// find offset in manifest file where this version is stored.
new_manifest_file_size = raw_desc_log_ptr->file()->GetFileSize();
+ if (new_descriptor_log) {
+ ROCKS_LOG_INFO(db_options_->info_log,
+ "Created manifest %" PRIu64
+ ", compacted+appended from %" PRIu64 " to %" PRIu64 "\n",
+ pending_manifest_file_number_, prev_manifest_file_size,
+ new_manifest_file_size);
+ }
}
if (first_writer.edit_list.front()->IsColumnFamilyDrop()) {
descriptor_log_ = std::move(new_desc_log_ptr);
obsolete_manifests_.emplace_back(
DescriptorFileName("", manifest_file_number_));
+ last_compacted_manifest_file_size_ = new_manifest_file_size;
+ TuneMaxManifestFileSize();
}
// Install the new versions
const ReadOptions read_options;
const WriteOptions write_options;
- ImmutableDBOptions db_options(*options);
+ ImmutableDBOptions imm_db_options(*options);
+ MutableDBOptions mutable_db_options(*options);
ColumnFamilyOptions cf_options(*options);
std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
options->table_cache_numshardbits));
WriteController wc(options->delayed_write_rate);
WriteBufferManager wb(options->db_write_buffer_size);
- VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc,
- nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/,
+ VersionSet versions(dbname, &imm_db_options, mutable_db_options, file_options,
+ tc.get(), &wb, &wc, nullptr /*BlockCacheTracer*/,
+ nullptr /*IOTracer*/,
/*db_id*/ "",
/*db_session_id*/ "", options->daily_offpeak_time_utc,
/*error_handler_*/ nullptr, /*unchanging=*/false);
}
ReactiveVersionSet::ReactiveVersionSet(
- const std::string& dbname, const ImmutableDBOptions* _db_options,
+ const std::string& dbname, const ImmutableDBOptions* imm_db_options,
+ const MutableDBOptions& mutable_db_options,
const FileOptions& _file_options, Cache* table_cache,
WriteBufferManager* write_buffer_manager, WriteController* write_controller,
const std::shared_ptr<IOTracer>& io_tracer)
- : VersionSet(dbname, _db_options, _file_options, table_cache,
- write_buffer_manager, write_controller,
+ : VersionSet(dbname, imm_db_options, mutable_db_options, _file_options,
+ table_cache, write_buffer_manager, write_controller,
/*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "",
/*db_session_id*/ "", /*daily_offpeak_time_utc*/ "",
/*error_handler=*/nullptr, /*unchanging=*/false) {}
// but false for secondary instance or writable DB).
class VersionSet {
public:
- VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
+ VersionSet(const std::string& dbname,
+ const ImmutableDBOptions* imm_db_options,
+ const MutableDBOptions& mutable_db_options,
const FileOptions& file_options, Cache* table_cache,
WriteBufferManager* write_buffer_manager,
WriteController* write_controller,
virtual Status Close(FSDirectory* db_dir, InstrumentedMutex* mu);
+ // Requires: already holding DB mutex `mu`, to ensure
+ // * Safely read values from `updated_options`
+ // * Safely update fields on `this` (must be read elsewhere while holding mu)
+ // except `mu` can be nullptr during initialization
+ void UpdatedMutableDbOptions(const MutableDBOptions& updated_options,
+ InstrumentedMutex* mu);
+
Status LogAndApplyToDefaultColumnFamily(
const ReadOptions& read_options, const WriteOptions& write_options,
VersionEdit* edit, InstrumentedMutex* mu,
}
const FileOptions& file_options() { return file_options_; }
- void ChangeFileOptions(const MutableDBOptions& new_options) {
- file_options_.writable_file_max_buffer_size =
- new_options.writable_file_max_buffer_size;
- }
// TODO - Consider updating together when file options change in SetDBOptions
const OffpeakTimeOption& offpeak_time_option() {
bool& TEST_unchanging() { return const_cast<bool&>(unchanging_); }
+ // Test-only accessor: returns the saved copy of max_manifest_file_size.
+ uint64_t TEST_GetMinMaxManifestFileSize() {
+ return min_max_manifest_file_size_;
+ }
+ // Test-only accessor: returns the saved, sanitized copy of
+ // max_manifest_space_amp_pct.
+ unsigned TEST_GetMaxManifestSpaceAmpPct() {
+ return max_manifest_space_amp_pct_;
+ }
+ // Test-only accessor: returns the saved copy of manifest_preallocation_size.
+ size_t TEST_GetManifestPreallocationSize() {
+ return manifest_preallocation_size_;
+ }
+ }
+
protected:
struct ManifestWriter;
}
};
+ // Revert back to a post-construction state (keep same options/settings)
void Reset();
// Returns approximated offset of a key in a file for a given version.
ColumnFamilyData* cfd, const std::string& fpath,
int level, const FileMetaData& meta);
+ // Auto-tune next max size for the current manifest file based on its initial
+ // "compacted" size and other parameters saved in this VersionSet. Must be
+ // holding DB mutex if outside of DB startup.
+ void TuneMaxManifestFileSize();
+
// Protected by DB mutex.
WalSet wals_;
// Current size of manifest file
uint64_t manifest_file_size_;
+ // Size of the populated manifest file last time it was re-written from
+ // scratch.
+ uint64_t last_compacted_manifest_file_size_;
+
+ // Auto-tuned max allowed size for the current manifest file
+ uint64_t tuned_max_manifest_file_size_;
+
+ // Saved copy of max_manifest_file_size in (Mutable)DBOptions
+ uint64_t min_max_manifest_file_size_;
+ // Saved, sanitized copy from (Mutable)DBOptions
+ unsigned max_manifest_space_amp_pct_;
+ // Saved copy from (Mutable)DBOptions
+ size_t manifest_preallocation_size_;
+
// Obsolete files, or during DB shutdown any files not referenced by what's
// left of the in-memory LSM state.
std::vector<ObsoleteFileInfo> obsolete_files_;
public:
ReactiveVersionSet(const std::string& dbname,
const ImmutableDBOptions* _db_options,
+ const MutableDBOptions& mutable_db_options,
const FileOptions& _file_options, Cache* table_cache,
WriteBufferManager* write_buffer_manager,
WriteController* write_controller,
: env_(nullptr),
dbname_(test::PerThreadDBPath(name)),
options_(),
- db_options_(options_),
+ imm_db_options_(options_),
cf_options_(options_),
- immutable_options_(db_options_, cf_options_),
+ immutable_options_(imm_db_options_, cf_options_),
mutable_cf_options_(cf_options_),
table_cache_(NewLRUCache(50000, 16)),
- write_buffer_manager_(db_options_.db_write_buffer_size),
+ write_buffer_manager_(imm_db_options_.db_write_buffer_size),
shutting_down_(false),
table_factory_(std::make_shared<mock::MockTableFactory>()) {
EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_));
EXPECT_OK(fs_->CreateDirIfMissing(dbname_, IOOptions(), nullptr));
options_.env = env_;
- db_options_.env = env_;
- db_options_.fs = fs_;
+ imm_db_options_.env = env_;
+ imm_db_options_.fs = fs_;
immutable_options_.env = env_;
immutable_options_.fs = fs_;
immutable_options_.clock = env_->GetSystemClock().get();
mutable_cf_options_.table_factory = table_factory_;
versions_.reset(new VersionSet(
- dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_,
+ dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*read_only=*/false));
reactive_versions_ = std::make_shared<ReactiveVersionSet>(
- dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_, nullptr);
- db_options_.db_paths.emplace_back(dbname_,
- std::numeric_limits<uint64_t>::max());
+ dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
+ nullptr);
+ imm_db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
}
virtual ~VersionSetTestBase() {
ASSERT_OK(
SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
VersionEdit new_db;
- if (db_options_.write_dbid_to_manifest) {
+ if (imm_db_options_.write_dbid_to_manifest) {
DBOptions tmp_db_options;
tmp_db_options.env = env_;
std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
void ReopenDB() {
versions_.reset(new VersionSet(
- dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_,
+ dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*read_only=*/false));
const std::string dbname_;
EnvOptions env_options_;
Options options_;
- ImmutableDBOptions db_options_;
+ ImmutableDBOptions imm_db_options_;
+ MutableDBOptions mutable_db_options_;
ColumnFamilyOptions cf_options_;
ImmutableOptions immutable_options_;
MutableCFOptions mutable_cf_options_;
// Recover a new VersionSet.
{
std::unique_ptr<VersionSet> new_versions(new VersionSet(
- dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_,
+ dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*unchanging=*/false));
// Recover a new VersionSet.
{
std::unique_ptr<VersionSet> new_versions(new VersionSet(
- dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_,
+ dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*unchanging=*/false));
// Recover a new VersionSet, only the non-closed WAL should show up.
{
std::unique_ptr<VersionSet> new_versions(new VersionSet(
- dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_,
+ dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*unchanging=*/false));
// Recover from the new MANIFEST, only the non-closed WAL should show up.
{
std::unique_ptr<VersionSet> new_versions(new VersionSet(
- dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_,
+ dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*unchanging=*/false));
// Recover a new VersionSet, WAL0 is deleted, WAL1 is not.
{
std::unique_ptr<VersionSet> new_versions(new VersionSet(
- dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_,
+ dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*unchanging=*/false));
// Recover a new VersionSet, all WALs are deleted.
{
std::unique_ptr<VersionSet> new_versions(new VersionSet(
- dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_,
+ dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*unchanging=*/false));
// kept.
{
std::unique_ptr<VersionSet> new_versions(new VersionSet(
- dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_,
+ dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*unchanging=*/false));
void VerifyFullHistoryTsLow(uint64_t expected_ts_low) {
std::unique_ptr<VersionSet> vset(new VersionSet(
- dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_,
+ dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*unchanging=*/false));
std::unique_ptr<log::Writer>* log_writer) override {
assert(nullptr != log_writer);
VersionEdit new_db;
- if (db_options_.write_dbid_to_manifest) {
+ if (imm_db_options_.write_dbid_to_manifest) {
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_,
Temperature::kUnknown));
DBOptions tmp_db_options;
const std::string VersionSetTestEmptyDb::kUnknownColumnFamilyName = "unknown";
TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
- db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
PrepareManifest(nullptr, nullptr, &log_writer_);
log_writer_.reset();
CreateCurrentFile();
}
TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) {
- db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
PrepareManifest(nullptr, nullptr, &log_writer_);
// Only a subset of column families in the MANIFEST.
VersionEdit new_cf1;
}
TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) {
- db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
PrepareManifest(nullptr, nullptr, &log_writer_);
// Write all column families but no log_number, next_file_number and
// last_sequence.
}
TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) {
- db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
PrepareManifest(nullptr, nullptr, &log_writer_);
// Write all column families but no log_number, next_file_number and
// last_sequence.
}
TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) {
- db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
PrepareManifest(nullptr, nullptr, &log_writer_);
// Write all column families but no log_number, next_file_number and
// last_sequence.
ASSERT_OK(s);
log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
VersionEdit new_db;
- if (db_options_.write_dbid_to_manifest) {
+ if (imm_db_options_.write_dbid_to_manifest) {
DBOptions tmp_db_options;
tmp_db_options.env = env_;
std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
}
TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
- db_options_.allow_2pc = true;
+ imm_db_options_.allow_2pc = true;
NewDB();
SstInfo sst(100, kDefaultColumnFamilyName, "a", 0 /* level */,
-// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
immutable_db_options_(WithDbPath(options, db_path)),
tc_(NewLRUCache(1 << 20 /* capacity */,
options.table_cache_numshardbits)),
- versions_(db_path, &immutable_db_options_, sopt_, tc_.get(), &wb_, &wc_,
+ versions_(db_path, &immutable_db_options_, MutableDBOptions{options},
+ sopt_, tc_.get(), &wb_, &wc_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"",
options.daily_offpeak_time_utc,
db_options_.clock = env_->GetSystemClock().get();
versions_.reset(new VersionSet(
- dbname_, &db_options_, env_options_, table_cache_.get(),
- &write_buffer_manager_, &write_controller_,
+ dbname_, &db_options_, MutableDBOptions{}, env_options_,
+ table_cache_.get(), &write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*read_only=*/false));
DECLARE_uint64(ops_per_thread);
DECLARE_uint64(log2_keys_per_lock);
DECLARE_uint64(max_manifest_file_size);
+DECLARE_int32(max_manifest_space_amp_pct);
DECLARE_bool(in_place_update);
DECLARE_string(memtablerep);
DECLARE_int32(prefix_size);
static const bool FLAGS_log2_keys_per_lock_dummy __attribute__((__unused__)) =
RegisterFlagValidator(&FLAGS_log2_keys_per_lock, &ValidateUint32Range);
-DEFINE_uint64(max_manifest_file_size, 16384, "Maximum size of a MANIFEST file");
+DEFINE_uint64(max_manifest_file_size, 16384,
+ "Maximum size of a MANIFEST file (without auto-tuning)");
+
+DEFINE_int32(max_manifest_space_amp_pct, 500,
+ "Max manifest space amp percentage for auto-tuning");
DEFINE_bool(in_place_update, false, "On true, does inplace update in memtable");
options.compression_opts.checksum = true;
}
options.max_manifest_file_size = FLAGS_max_manifest_file_size;
+ options.max_manifest_space_amp_pct = FLAGS_max_manifest_space_amp_pct;
options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
options.allow_concurrent_memtable_write =
FLAGS_allow_concurrent_memtable_write;
// Default: 0
size_t recycle_log_file_num = 0;
- // manifest file is rolled over on reaching this limit.
- // The older manifest file be deleted.
- // The default value is 1GB so that the manifest file can grow, but not
- // reach the limit of storage capacity.
+ // The manifest file is rolled over on reaching this limit AND the
+ // space amp limit described in max_manifest_space_amp_pct. More trade-off
+ // details there.
+ //
+ // NOTE: this option used to be a hard limit, but that made this a dangerous
+ // tuning parameter for optimizing manifest file size because the best
+ // size really depends on the DB size and average SST file size (and other
+ // settings). Now it is essentially a minimum for the auto-tuned max manifest
+ // file size.
+ //
+ // Until the max_manifest_space_amp_pct feature is validated enough to show
+ // that a smaller default here, like 1MB, is appropriate, the default value
+ // is 1GB to match historical behavior (without it being a hard limit, in
+ // case of a giant compacted manifest).
+ //
+ // This option is mutable with SetDBOptions(), taking effect on the next
+ // manifest write (e.g. completed DB compaction or flush).
uint64_t max_manifest_file_size = 1024 * 1024 * 1024;
+ // This option mostly replaces max_manifest_file_size to control an auto-tuned
+ // balance of manifest write amplification and space amplification. A new
+ // manifest file is created with the "compacted" contents of the old one when
+ // current_manifest_size
+ // >
+ // max(max_manifest_file_size,
+ // est_compacted_manifest_size * (1 + max_manifest_space_amp_pct/100))
+ //
+ // where est_compacted_manifest_size is an estimate of how big a new compacted
+ // version of the current manifest would be. Currently, the estimate used is
+ // the last newly-written manifest, in its "compacted" form.
+ //
+ // Space amplification in the manifest file might be less of a concern for
+ // primary storage space and more of a concern for DB recovery time and size
+ // of backup files that aren't incremental between backups. To minimize
+ // manifest churn on initial DB population, setting max_manifest_file_size to
+ // something not too small, like 1MB, should suffice. Similarly, write amp on
+ // the manifest file is likely not a direct concern but completed compactions
+ // and flushes cannot (currently) be committed while the (relatively small)
+ // manifest file is being compacted. Manifest compactions should not
+ // interfere with user write latency or throughput unless the DB is
+ // chronically stalling or close to stalling writes already.
+ //
+ // For this option to have a meaningful effect, it is recommended to set
+ // max_manifest_file_size to something modest like 1MB. Then we can interpret
+ // values for this option as follows, starting with minimum space amp and
+ // maximum write amp:
+ // * 0 - Every manifest write (flush, compaction, etc.) generates a whole new
+ // manifest. Only useful for testing.
+ // * very small - Doesn't take many manifest writes to generate a whole new
+ // manifest.
+ // * 100 - In a DB with pretty consistent number of SST files, etc., achieves
+ // about 1.0 write amp (writing about 2x the theoretical minimum) and a max of
+ // about 1.0 space amp (manifest up to 2x the compacted size).
+ // * 500 - Recommended and default: 0.2 write amp and up to roughly 5.0 space
+ // amp.
+ // * 10000 - 0.01 write amp and up to 100 space amp on the manifest.
+ //
+ // This option is mutable with SetDBOptions(), taking effect on the next
+ // manifest write (e.g. completed DB compaction or flush).
+ int max_manifest_space_amp_pct = 500;
+
// Number of shards used for table cache.
int table_cache_numshardbits = 6;
{offsetof(struct MutableDBOptions, max_background_flushes),
OptionType::kInt, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
+ {"max_manifest_file_size",
+ {offsetof(struct MutableDBOptions, max_manifest_file_size),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+ {"max_manifest_space_amp_pct",
+ {offsetof(struct MutableDBOptions, max_manifest_space_amp_pct),
+ OptionType::kInt, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+ {"manifest_preallocation_size",
+ {offsetof(struct MutableDBOptions, manifest_preallocation_size),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
{"daily_offpeak_time_utc",
{offsetof(struct MutableDBOptions, daily_offpeak_time_utc),
OptionType::kString, OptionVerificationType::kNormal,
{offsetof(struct ImmutableDBOptions, log_file_time_to_roll),
OptionType::kSizeT, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
- {"manifest_preallocation_size",
- {offsetof(struct ImmutableDBOptions, manifest_preallocation_size),
- OptionType::kSizeT, OptionVerificationType::kNormal,
- OptionTypeFlags::kNone}},
{"max_log_file_size",
{offsetof(struct ImmutableDBOptions, max_log_file_size),
OptionType::kSizeT, OptionVerificationType::kNormal,
{offsetof(struct ImmutableDBOptions, WAL_ttl_seconds),
OptionType::kUInt64T, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
- {"max_manifest_file_size",
- {offsetof(struct ImmutableDBOptions, max_manifest_file_size),
- OptionType::kUInt64T, OptionVerificationType::kNormal,
- OptionTypeFlags::kNone}},
{"persist_stats_to_disk",
{offsetof(struct ImmutableDBOptions, persist_stats_to_disk),
OptionType::kBoolean, OptionVerificationType::kNormal,
explicit DBOptionsConfigurable(
const DBOptions& opts,
const std::unordered_map<std::string, std::string>* map = nullptr)
- : MutableDBConfigurable(MutableDBOptions(opts), map), db_options_(opts) {
+ : MutableDBConfigurable(MutableDBOptions{opts}, map), db_options_(opts) {
// The ImmutableDBOptions currently requires the env to be non-null. Make
// sure it is
if (opts.env != nullptr) {
return ptr;
}
-ImmutableDBOptions::ImmutableDBOptions() : ImmutableDBOptions(Options()) {}
+ImmutableDBOptions::ImmutableDBOptions() : ImmutableDBOptions(DBOptions{}) {}
ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
: create_if_missing(options.create_if_missing),
log_file_time_to_roll(options.log_file_time_to_roll),
keep_log_file_num(options.keep_log_file_num),
recycle_log_file_num(options.recycle_log_file_num),
- max_manifest_file_size(options.max_manifest_file_size),
table_cache_numshardbits(options.table_cache_numshardbits),
WAL_ttl_seconds(options.WAL_ttl_seconds),
WAL_size_limit_MB(options.WAL_size_limit_MB),
max_write_batch_group_size_bytes(
options.max_write_batch_group_size_bytes),
- manifest_preallocation_size(options.manifest_preallocation_size),
allow_mmap_reads(options.allow_mmap_reads),
allow_mmap_writes(options.allow_mmap_writes),
use_direct_reads(options.use_direct_reads),
ROCKS_LOG_HEADER(
log, " Options.max_log_file_size: %" ROCKSDB_PRIszt,
max_log_file_size);
- ROCKS_LOG_HEADER(log,
- " Options.max_manifest_file_size: %" PRIu64,
- max_manifest_file_size);
ROCKS_LOG_HEADER(
log, " Options.log_file_time_to_roll: %" ROCKSDB_PRIszt,
log_file_time_to_roll);
" "
"Options.max_write_batch_group_size_bytes: %" PRIu64,
max_write_batch_group_size_bytes);
- ROCKS_LOG_HEADER(
- log, " Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
- manifest_preallocation_size);
ROCKS_LOG_HEADER(log, " Options.is_fd_close_on_exec: %d",
is_fd_close_on_exec);
ROCKS_LOG_HEADER(log, " Options.advise_random_on_open: %d",
}
}
-MutableDBOptions::MutableDBOptions()
- : max_background_jobs(2),
- max_background_compactions(-1),
- max_subcompactions(0),
- avoid_flush_during_shutdown(false),
- writable_file_max_buffer_size(1024 * 1024),
- delayed_write_rate(2 * 1024U * 1024U),
- max_total_wal_size(0),
- delete_obsolete_files_period_micros(6ULL * 60 * 60 * 1000000),
- stats_dump_period_sec(600),
- stats_persist_period_sec(600),
- stats_history_buffer_size(1024 * 1024),
- max_open_files(-1),
- bytes_per_sync(0),
- wal_bytes_per_sync(0),
- strict_bytes_per_sync(false),
- compaction_readahead_size(0),
- max_background_flushes(-1) {}
+MutableDBOptions::MutableDBOptions() : MutableDBOptions(DBOptions{}) {}
MutableDBOptions::MutableDBOptions(const DBOptions& options)
: max_background_jobs(options.max_background_jobs),
strict_bytes_per_sync(options.strict_bytes_per_sync),
compaction_readahead_size(options.compaction_readahead_size),
max_background_flushes(options.max_background_flushes),
+ max_manifest_file_size(options.max_manifest_file_size),
+ max_manifest_space_amp_pct(options.max_manifest_space_amp_pct),
+ manifest_preallocation_size(options.manifest_preallocation_size),
daily_offpeak_time_utc(options.daily_offpeak_time_utc) {}
void MutableDBOptions::Dump(Logger* log) const {
compaction_readahead_size);
ROCKS_LOG_HEADER(log, " Options.max_background_flushes: %d",
max_background_flushes);
+ ROCKS_LOG_HEADER(log,
+ " Options.max_manifest_file_size: %" PRIu64,
+ max_manifest_file_size);
+ ROCKS_LOG_HEADER(log,
+ " Options.max_manifest_space_amp_pct: %d",
+ max_manifest_space_amp_pct);
+ ROCKS_LOG_HEADER(
+ log, " Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
+ manifest_preallocation_size);
ROCKS_LOG_HEADER(log, "Options.daily_offpeak_time_utc: %s",
daily_offpeak_time_utc.c_str());
}
size_t log_file_time_to_roll;
size_t keep_log_file_num;
size_t recycle_log_file_num;
- uint64_t max_manifest_file_size;
int table_cache_numshardbits;
uint64_t WAL_ttl_seconds;
uint64_t WAL_size_limit_MB;
uint64_t max_write_batch_group_size_bytes;
- size_t manifest_preallocation_size;
bool allow_mmap_reads;
bool allow_mmap_writes;
bool use_direct_reads;
bool strict_bytes_per_sync;
size_t compaction_readahead_size;
int max_background_flushes;
+ uint64_t max_manifest_file_size;
+ int max_manifest_space_amp_pct;
+ size_t manifest_preallocation_size;
std::string daily_offpeak_time_utc;
};
options.log_file_time_to_roll = immutable_db_options.log_file_time_to_roll;
options.keep_log_file_num = immutable_db_options.keep_log_file_num;
options.recycle_log_file_num = immutable_db_options.recycle_log_file_num;
- options.max_manifest_file_size = immutable_db_options.max_manifest_file_size;
+ options.max_manifest_file_size = mutable_db_options.max_manifest_file_size;
+ options.max_manifest_space_amp_pct =
+ mutable_db_options.max_manifest_space_amp_pct;
options.table_cache_numshardbits =
immutable_db_options.table_cache_numshardbits;
options.WAL_ttl_seconds = immutable_db_options.WAL_ttl_seconds;
options.WAL_size_limit_MB = immutable_db_options.WAL_size_limit_MB;
options.manifest_preallocation_size =
- immutable_db_options.manifest_preallocation_size;
+ mutable_db_options.manifest_preallocation_size;
options.allow_mmap_reads = immutable_db_options.allow_mmap_reads;
options.allow_mmap_writes = immutable_db_options.allow_mmap_writes;
options.use_direct_reads = immutable_db_options.use_direct_reads;
"skip_stats_update_on_db_open=false;"
"skip_checking_sst_file_sizes_on_db_open=false;"
"max_manifest_file_size=4295009941;"
+ "max_manifest_space_amp_pct=321;"
"db_log_dir=path/to/db_log_dir;"
"writable_file_max_buffer_size=1048576;"
"paranoid_checks=true;"
{"keep_log_file_num", "39"},
{"recycle_log_file_num", "5"},
{"max_manifest_file_size", "40"},
+ {"max_manifest_space_amp_pct", "42"},
{"table_cache_numshardbits", "41"},
{"WAL_ttl_seconds", "43"},
{"WAL_size_limit_MB", "44"},
ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U);
ASSERT_EQ(new_db_opt.keep_log_file_num, 39U);
ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U);
- ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast<uint64_t>(40));
+ ASSERT_EQ(new_db_opt.max_manifest_file_size, uint64_t{40});
+ ASSERT_EQ(new_db_opt.max_manifest_space_amp_pct, 42);
ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41);
ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43));
ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast<uint64_t>(44));
{"keep_log_file_num", "39"},
{"recycle_log_file_num", "5"},
{"max_manifest_file_size", "40"},
+ {"max_manifest_space_amp_pct", "42"},
{"table_cache_numshardbits", "41"},
{"WAL_ttl_seconds", "43"},
{"WAL_size_limit_MB", "44"},
ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U);
ASSERT_EQ(new_db_opt.keep_log_file_num, 39U);
ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U);
- ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast<uint64_t>(40));
+ ASSERT_EQ(new_db_opt.max_manifest_file_size, uint64_t{40});
+ ASSERT_EQ(new_db_opt.max_manifest_space_amp_pct, 42);
ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41);
ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43));
ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast<uint64_t>(44));
db/db_clip_test.cc \
db/db_dynamic_level_test.cc \
db/db_encryption_test.cc \
+ db/db_etc3_test.cc \
db/db_flush_test.cc \
db/db_follower_test.cc \
db/db_readonly_with_timestamp_test.cc \
ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
"Number of bytes to buffer in all memtables before compacting");
+DEFINE_int64(max_manifest_file_size,
+ ROCKSDB_NAMESPACE::Options().max_manifest_file_size,
+ "Max manifest file size (or minimum max with auto-tuning)");
+
+DEFINE_int32(max_manifest_space_amp_pct,
+ ROCKSDB_NAMESPACE::Options().max_manifest_space_amp_pct,
+ "Max manifest space amp percentage for auto-tuning");
+
DEFINE_bool(cost_write_buffer_to_cache, false,
"The usage of memtable is costed to the block cache");
options.write_buffer_manager.reset(
new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
}
+ options.max_manifest_file_size = FLAGS_max_manifest_file_size;
+ options.max_manifest_space_amp_pct = FLAGS_max_manifest_space_amp_pct;
options.arena_block_size = FLAGS_arena_block_size;
options.write_buffer_size = FLAGS_write_buffer_size;
options.max_write_buffer_number = FLAGS_max_write_buffer_number;
# Test small max_manifest_file_size in a smaller chance, as most of the
# time we wnat manifest history to be preserved to help debug
"max_manifest_file_size": lambda: random.choice(
- [t * 16384 if t < 3 else 1024 * 1024 * 1024 for t in range(1, 30)]
+ [t * 2048 if t < 5 else 1024 * 1024 * 1024 for t in range(1, 30)]
),
+ "max_manifest_space_amp_pct": lambda: random.choice([0, 10, 100, 1000]),
# Sync mode might make test runs slower so running it in a smaller chance
"sync": lambda: random.choice([1 if t == 0 else 0 for t in range(0, 20)]),
"bytes_per_sync": lambda: random.choice([0, 262144]),
WriteController wc(options.delayed_write_rate);
WriteBufferManager wb(options.db_write_buffer_size);
ImmutableDBOptions immutable_db_options(options);
- VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc,
+ VersionSet versions(dbname, &immutable_db_options, MutableDBOptions{}, sopt,
+ tc.get(), &wb, &wc,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"",
options.daily_offpeak_time_utc,
WriteController wc(options.delayed_write_rate);
WriteBufferManager wb(options.db_write_buffer_size);
ImmutableDBOptions immutable_db_options(options);
- VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc,
+ VersionSet versions(dbname, &immutable_db_options, MutableDBOptions{options},
+ sopt, tc.get(), &wb, &wc,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"",
options.daily_offpeak_time_utc,
const InternalKeyComparator cmp(opt.comparator);
WriteController wc(opt.delayed_write_rate);
WriteBufferManager wb(opt.db_write_buffer_size);
- VersionSet versions(db_path_, &db_options, soptions, tc.get(), &wb, &wc,
+ VersionSet versions(db_path_, &db_options, MutableDBOptions{opt}, soptions,
+ tc.get(), &wb, &wc,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"",
opt.daily_offpeak_time_utc,
WriteController wc(options_.delayed_write_rate);
WriteBufferManager wb(options_.db_write_buffer_size);
ImmutableDBOptions immutable_db_options(options_);
- VersionSet versions(dbname_, &immutable_db_options, sopt, tc.get(), &wb,
- &wc, nullptr, nullptr, "", "",
+ VersionSet versions(dbname_, &immutable_db_options,
+ MutableDBOptions{options_}, sopt, tc.get(), &wb, &wc,
+ nullptr, nullptr, "", "",
options_.daily_offpeak_time_utc, nullptr,
/*read_only=*/false);
std::vector<std::string> cf_name_list;
--- /dev/null
+* Added an auto-tuning feature for DB manifest file size that also (by default) improves the safety of existing configurations in case `max_manifest_file_size` is repeatedly exceeded. The new recommendation is to set `max_manifest_file_size` to something small like 1MB and tune `max_manifest_space_amp_pct` as needed to balance write amp and space amp in the manifest. Refer to comments on those options in `DBOptions` for details. Both options are (now) mutable.
TEST_F(BackupEngineTest, ChangeManifestDuringBackupCreation) {
DestroyDBWithoutCheck(dbname_, options_);
options_.max_manifest_file_size = 0; // always rollover manifest for file add
+ options_.max_manifest_space_amp_pct = 0;
OpenDBAndBackupEngine(true);
FillDB(db_.get(), 0, 100, kAutoFlushOnly);
TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing) {
Options options = CurrentOptions();
options.max_manifest_file_size = 0; // always rollover manifest for file add
+ options.max_manifest_space_amp_pct = 0;
Reopen(options);
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(