git-server-git.apps.pok.os.sepia.ceph.com Git - rocksdb.git/commitdiff
Add an option wal_bytes_per_sync to control sync_file_range for WAL files
author Igor Canadi <icanadi@fb.com>
Tue, 19 May 2015 00:03:59 +0000 (17:03 -0700)
committer Igor Canadi <icanadi@fb.com>
Tue, 19 May 2015 00:03:59 +0000 (17:03 -0700)
Summary:
sync_file_range is not always asynchronous and thus can block writes if we call it for WAL files in the foreground thread. See more here: http://yoshinorimatsunobu.blogspot.com/2014/03/how-syncfilerange-really-works.html

Some users don't want us to call sync_file_range on WALs. Others do.
Thus, I'm adding a separate option wal_bytes_per_sync to control calling
sync_file_range on WAL files. bytes_per_sync will apply only to table
files now.

Test Plan: no more sync_file_range for WAL as evidenced by strace

Reviewers: yhchiang, rven, sdong

Reviewed By: sdong

Subscribers: dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D38253

HISTORY.md
db/db_impl.cc
include/rocksdb/env.h
include/rocksdb/options.h
util/env.cc
util/env_posix.cc
util/options.cc
util/options_helper.cc
util/options_test.cc

index 09a34e19fabe9d0bfbcb64cb13eb8deb5927635d..95c4f968800f38ab27bbcdf88e476fa5047e96bc 100644 (file)
@@ -11,6 +11,7 @@
 
 ### Public API changes
 * TablePropertiesCollector::AddUserKey() is added to replace TablePropertiesCollector::Add(). AddUserKey() exposes key type, sequence number and file size up to now to users.
+* DBOptions::bytes_per_sync used to apply to both WAL and table files. As of 3.11 it applies only to table files. If you want to sync WAL files in the background, please use DBOptions::wal_bytes_per_sync instead.
 
 ## 3.10.0 (3/24/2015)
 ### New Features
index 19ce9c311e1874256b45286e26de9d2734e0e424..797f2d8ff0423ab5c58c5d88a2efbac3d82baf88 100644 (file)
@@ -3455,7 +3455,7 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
     if (creating_new_log) {
       s = env_->NewWritableFile(
           LogFileName(db_options_.wal_dir, new_log_number), &lfile,
-          env_->OptimizeForLogWrite(env_options_));
+          env_->OptimizeForLogWrite(env_options_, db_options_));
       if (s.ok()) {
         // Our final size should be less than write_buffer_size
         // (compression, etc) but err on the side of caution.
@@ -3965,7 +3965,8 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
     EnvOptions soptions(db_options);
     s = impl->db_options_.env->NewWritableFile(
         LogFileName(impl->db_options_.wal_dir, new_log_number), &lfile,
-        impl->db_options_.env->OptimizeForLogWrite(soptions));
+        impl->db_options_.env->OptimizeForLogWrite(soptions,
+                                                   impl->db_options_));
     if (s.ok()) {
       lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size);
       impl->logfile_number_ = new_log_number;
index d7403f8c0a3807370e6dbc5fe87233b7e4db9a4f..2fb92421488c70360698a84fbb2b4cb329da6a18 100644 (file)
@@ -300,7 +300,8 @@ class Env {
   // OptimizeForLogWrite will create a new EnvOptions object that is a copy of
   // the EnvOptions in the parameters, but is optimized for writing log files.
   // Default implementation returns the copy of the same object.
-  virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const;
+  virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+                                         const DBOptions& db_options) const;
   // OptimizeForManifestWrite will create a new EnvOptions object that is a copy
   // of the EnvOptions in the parameters, but is optimized for writing manifest
   // files. Default implementation returns the copy of the same object.
index beee23a1adf346955e023cc9082b97d130230646..ea11b8132b4999534e4b7d64a5b5baa67ccd3a93 100644 (file)
@@ -1001,8 +1001,14 @@ struct DBOptions {
   // You may consider using rate_limiter to regulate write rate to device.
   // When rate limiter is enabled, it automatically enables bytes_per_sync
   // to 1MB.
+  //
+  // This option applies to table files
   uint64_t bytes_per_sync;
 
+  // Same as bytes_per_sync, but applies to WAL files
+  // Default: 0, turned off
+  uint64_t wal_bytes_per_sync;
+
   // If true, then the status of the threads involved in this DB will
   // be tracked and available via GetThreadList() API.
   //
index a95205273aadee67c24dc0b7ab34641602f974e5..0695b551aa784440513ca7f8982f6c167f632fb1 100644 (file)
@@ -249,8 +249,11 @@ void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
 
 }
 
-EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options) const {
-  return env_options;
+EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options,
+                                    const DBOptions& db_options) const {
+  EnvOptions optimized_env_options(env_options);
+  optimized_env_options.bytes_per_sync = db_options.wal_bytes_per_sync;
+  return optimized_env_options;
 }
 
 EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const {
index 891e4fbdf6767ced4444dce6e82ef9eb0293bcbb..c8f50720ba1dd207c2cb0fa882a65c95f542aa27 100644 (file)
@@ -1510,9 +1510,11 @@ class PosixEnv : public Env {
     return dummy;
   }
 
-  EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const override {
+  EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+                                 const DBOptions& db_options) const override {
     EnvOptions optimized = env_options;
     optimized.use_mmap_writes = false;
+    optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
     // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
     // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
     // test and make this false
index 3f56d276ea360bc85cf7f12993bb653961140604..6bb462aa55d290f146f2cbfd454d5ca777fa1848 100644 (file)
@@ -255,6 +255,7 @@ DBOptions::DBOptions()
       access_hint_on_compaction_start(NORMAL),
       use_adaptive_mutex(false),
       bytes_per_sync(0),
+      wal_bytes_per_sync(0),
       enable_thread_tracking(false) {
 }
 
@@ -298,6 +299,7 @@ DBOptions::DBOptions(const Options& options)
       access_hint_on_compaction_start(options.access_hint_on_compaction_start),
       use_adaptive_mutex(options.use_adaptive_mutex),
       bytes_per_sync(options.bytes_per_sync),
+      wal_bytes_per_sync(options.wal_bytes_per_sync),
       enable_thread_tracking(options.enable_thread_tracking) {}
 
 static const char* const access_hints[] = {
@@ -364,6 +366,8 @@ void DBOptions::Dump(Logger* log) const {
         rate_limiter.get());
     Log(log, "                          Options.bytes_per_sync: %" PRIu64,
         bytes_per_sync);
+    Log(log, "                      Options.wal_bytes_per_sync: %" PRIu64,
+        wal_bytes_per_sync);
     Log(log, "                  Options.enable_thread_tracking: %d",
         enable_thread_tracking);
 }  // DBOptions::Dump
index 8f982c196fb21b01fa25625c6a0ae745bf1fc1d0..07fc0531c1edd04cbc865cb8c710ab52bcf1678e 100644 (file)
@@ -555,6 +555,8 @@ bool ParseDBOption(const std::string& name, const std::string& value,
       new_options->use_adaptive_mutex = ParseBoolean(name, value);
     } else if (name == "bytes_per_sync") {
       new_options->bytes_per_sync = ParseUint64(value);
+    } else if (name == "wal_bytes_per_sync") {
+      new_options->wal_bytes_per_sync = ParseUint64(value);
     } else {
       return false;
     }
index 1fcb1e8b8cbe551dda56be09ce6ebf3527d6f69f..6a3b2d404acf9115bf034bc1e96c03e8a5687d5c 100644 (file)
@@ -172,6 +172,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
     {"advise_random_on_open", "true"},
     {"use_adaptive_mutex", "false"},
     {"bytes_per_sync", "47"},
+    {"wal_bytes_per_sync", "48"},
   };
 
   ColumnFamilyOptions base_cf_opt;
@@ -278,6 +279,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_db_opt.advise_random_on_open, true);
   ASSERT_EQ(new_db_opt.use_adaptive_mutex, false);
   ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast<uint64_t>(47));
+  ASSERT_EQ(new_db_opt.wal_bytes_per_sync, static_cast<uint64_t>(48));
 }
 #endif  // !ROCKSDB_LITE