From: Xiaoxi Chen Date: Fri, 27 Mar 2015 08:08:10 +0000 (+0800) Subject: Update RocksDB configuration to make it more clear X-Git-Tag: v9.0.0~56^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=0bd767fb7ecca78033dc9d99f221e88ad0c4b289;p=ceph.git Update RocksDB configuration to make it more clear Add some performance-critical configurations. Also group and polish the description of each configuration to make it clearer, and change the defaults from 0 to the actual values. Signed-off-by: Xiaoxi Chen --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 2b9c7e778ef0..894e2cab645c 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -697,27 +697,40 @@ OPTION(kinetic_user_id, OPT_INT, 1) // kinetic user to authenticate as OPTION(kinetic_hmac_key, OPT_STR, "asdfasdf") // kinetic key to authenticate with OPTION(kinetic_use_ssl, OPT_BOOL, false) // whether to secure kinetic traffic with TLS -OPTION(rocksdb_compact_on_mount, OPT_BOOL, false) -OPTION(rocksdb_write_buffer_size, OPT_U64, 0) // rocksdb write buffer size -OPTION(rocksdb_target_file_size_base, OPT_U64, 0) // target file size for compaction + +//in memory write buffer configuration +OPTION(rocksdb_write_buffer_size, OPT_U64, 8*1024*1024) // rocksdb write buffer size, should be larger than average write size. +OPTION(rocksdb_write_buffer_num, OPT_INT, 2) // The maximum number of write buffers that are built up in memory. +OPTION(rocksdb_min_write_buffer_number_to_merge, OPT_INT, 1) // The min write buffers that will be merged together before writing to storage. +//on disk level0 configuration +OPTION(rocksdb_level0_file_num_compaction_trigger, OPT_INT, 4) // Number of files to trigger level-0 compaction +OPTION(rocksdb_level0_slowdown_writes_trigger, OPT_INT, -1) // number of level-0 files at which we start slowing down write. -1 means not set. 
+OPTION(rocksdb_level0_stop_writes_trigger, OPT_INT, -1) // number of level-0 files at which we stop writes. -1 means not set. +//on disk level1+ configuration +OPTION(rocksdb_max_bytes_for_level_base, OPT_U64, 10*1024*1024) // max total bytes for level 1 +OPTION(rocksdb_max_bytes_for_level_multiplier, OPT_INT, 10) // max total bytes for level 1 +OPTION(rocksdb_target_file_size_base, OPT_U64, 2*1024*1024) // target file size for level 1 +OPTION(rocksdb_target_file_size_multiplier, OPT_INT, 1) // target file size for Level-N = (multiplier)^(N-1) * file_size_base +OPTION(rocksdb_num_levels, OPT_INT, 7) // number of levels for this database,chang OPTION(rocksdb_cache_size, OPT_U64, 0) // rocksdb cache size -OPTION(rocksdb_block_size, OPT_U64, 0) // rocksdb block size +OPTION(rocksdb_block_size, OPT_U64, 4*1024) // rocksdb block size OPTION(rocksdb_bloom_bits_per_key, OPT_INT, 10) // rocksdb bloom bits per entry -OPTION(rocksdb_write_buffer_num, OPT_INT, 0) // rocksdb bloom bits per entry -OPTION(rocksdb_background_compactions, OPT_INT, 0) // number for background compaction jobs -OPTION(rocksdb_background_flushes, OPT_INT, 0) // number for background flush jobs -OPTION(rocksdb_max_open_files, OPT_INT, 0) // rocksdb max open files -OPTION(rocksdb_compression, OPT_STR, "") // rocksdb uses compression : none, snappy, zlib, bzip2 -OPTION(rocksdb_paranoid, OPT_BOOL, false) // rocksdb paranoid flag +//concurrency of compaction and flush +OPTION(rocksdb_max_background_compactions, OPT_INT, 1) // number for background compaction jobs +OPTION(rocksdb_compaction_threads, OPT_INT, 1) // number for background compaction jobs +OPTION(rocksdb_max_background_flushes, OPT_INT, 1) // number for background flush jobs +OPTION(rocksdb_flusher_threads, OPT_INT, 1) // number for background compaction jobs +//Other +OPTION(rocksdb_max_open_files, OPT_INT, 5000) // rocksdb max open files +OPTION(rocksdb_compression, OPT_STR, "snappy") // rocksdb uses compression : none, snappy, zlib, bzip2 
+OPTION(rocksdb_compact_on_mount, OPT_BOOL, false) +OPTION(rocksdb_paranoid, OPT_BOOL, false) // RocksDB will aggressively check consistency of the data. OPTION(rocksdb_log, OPT_STR, "/dev/null") // enable rocksdb log file -OPTION(rocksdb_level0_file_num_compaction_trigger, OPT_U64, 0) // Number of files to trigger level-0 compaction -OPTION(rocksdb_level0_slowdown_writes_trigger, OPT_U64, 0) // number of level-0 files at which we start slowing down write. -OPTION(rocksdb_level0_stop_writes_trigger, OPT_U64, 0) // number of level-0 files at which we stop writes -OPTION(rocksdb_disableDataSync, OPT_BOOL, true) // if true, data files are not synced to stable storage -OPTION(rocksdb_disableWAL, OPT_BOOL, false) // diable write ahead log -OPTION(rocksdb_num_levels, OPT_INT, 0) // number of levels for this database -OPTION(rocksdb_wal_dir, OPT_STR, "") // rocksdb write ahead log file OPTION(rocksdb_info_log_level, OPT_STR, "info") // info log level : debug , info , warn, error, fatal +OPTION(rocksdb_wal_dir, OPT_STR, "") // rocksdb write ahead log file, put it to fast device will benifit wrtie performance +OPTION(rocksdb_disableDataSync, OPT_BOOL, true) // if true, data files are not synced to stable storage +OPTION(rocksdb_disableWAL, OPT_BOOL, false) // if true, writes will not first go to the write ahead log + /** * osd_client_op_priority and osd_recovery_op_priority adjust the relative diff --git a/src/os/RocksDBStore.cc b/src/os/RocksDBStore.cc index 237b098905f5..9d8104bfb0a3 100644 --- a/src/os/RocksDBStore.cc +++ b/src/os/RocksDBStore.cc @@ -24,25 +24,35 @@ using std::string; int RocksDBStore::init() { options.write_buffer_size = g_conf->rocksdb_write_buffer_size; + options.write_buffer_num = g_conf->rocksdb_write_buffer_num; + options.min_write_buffer_number_to_merge = g_conf->rocksdb_min_write_buffer_number_to_merge; + + options.level0_file_num_compaction_trigger = g_conf->rocksdb_level0_file_num_compaction_trigger; + options.level0_slowdown_writes_trigger = 
g_conf->rocksdb_level0_slowdown_writes_trigger; + options.level0_stop_writes_trigger = g_conf->rocksdb_level0_stop_writes_trigger; + + options.max_bytes_for_level_base = g_conf->rocksdb_max_bytes_for_level_base; + options.max_bytes_for_level_multiplier = g_conf->rocksdb_max_bytes_for_level_multiplier; + options.target_file_size_base = g_conf->rocksdb_target_file_size_base; + options.target_file_size_multiplier = g_conf->rocksdb_target_file_size_multiplier; + options.num_levels = g_conf->rocksdb_num_levels; options.cache_size = g_conf->rocksdb_cache_size; options.block_size = g_conf->rocksdb_block_size; options.bloom_bits_per_key = g_conf->rocksdb_bloom_bits_per_key; + + options.max_background_compactions = g_conf->rocksdb_max_background_compactions; + options.compaction_threads = g_conf->rocksdb_compaction_threads; + options.max_background_flushes = g_conf->rocksdb_max_background_flushes; + options.flusher_threads = g_conf->rocksdb_flusher_threads; + + options.max_open_files = g_conf->rocksdb_max_open_files; options.compression_type = g_conf->rocksdb_compression; options.paranoid_checks = g_conf->rocksdb_paranoid; - options.max_open_files = g_conf->rocksdb_max_open_files; options.log_file = g_conf->rocksdb_log; - options.write_buffer_num = g_conf->rocksdb_write_buffer_num; - options.max_background_compactions = g_conf->rocksdb_background_compactions; - options.max_background_flushes = g_conf->rocksdb_background_flushes; - options.target_file_size_base = g_conf->rocksdb_target_file_size_base; - options.level0_file_num_compaction_trigger = g_conf->rocksdb_level0_file_num_compaction_trigger; - options.level0_slowdown_writes_trigger = g_conf->rocksdb_level0_slowdown_writes_trigger; - options.level0_stop_writes_trigger = g_conf->rocksdb_level0_stop_writes_trigger; + options.info_log_level = g_conf->rocksdb_info_log_level; + options.wal_dir = g_conf->rocksdb_wal_dir; options.disableDataSync = g_conf->rocksdb_disableDataSync; - options.num_levels = 
g_conf->rocksdb_num_levels; options.disableWAL = g_conf->rocksdb_disableWAL; - options.wal_dir = g_conf->rocksdb_wal_dir; - options.info_log_level = g_conf->rocksdb_info_log_level; return 0; } @@ -50,27 +60,37 @@ int RocksDBStore::do_open(ostream &out, bool create_if_missing) { rocksdb::Options ldoptions; rocksdb::BlockBasedTableOptions table_options; + auto env = rocksdb::Env::Default(); + + ldoptions.write_buffer_size = options.write_buffer_size; + ldoptions.max_write_buffer_number = options.write_buffer_num; + ldoptions.min_write_buffer_number_to_merge = options.min_write_buffer_number_to_merge; + + ldoptions.level0_file_num_compaction_trigger = options.level0_file_num_compaction_trigger; + if(options.level0_slowdown_writes_trigger >= 0) + ldoptions.level0_slowdown_writes_trigger = options.level0_slowdown_writes_trigger; + if(options.level0_stop_writes_trigger >= 0) + ldoptions.level0_stop_writes_trigger = options.level0_stop_writes_trigger; - if (options.write_buffer_size) - ldoptions.write_buffer_size = options.write_buffer_size; - if (options.write_buffer_num) - ldoptions.max_write_buffer_number = options.write_buffer_num; - if (options.max_background_compactions) - ldoptions.max_background_compactions = options.max_background_compactions; - if (options.max_background_flushes) - ldoptions.max_background_flushes = options.max_background_flushes; - if (options.target_file_size_base) - ldoptions.target_file_size_base = options.target_file_size_base; - if (options.max_open_files) - ldoptions.max_open_files = options.max_open_files; + ldoptions.max_bytes_for_level_base = options.max_bytes_for_level_base; + ldoptions.max_bytes_for_level_multiplier = options.max_bytes_for_level_multiplier; + ldoptions.target_file_size_base = options.target_file_size_base; + ldoptions.target_file_size_multiplier = options.target_file_size_multiplier; + ldoptions.num_levels = options.num_levels; if (options.cache_size) { table_options.block_cache = 
rocksdb::NewLRUCache(options.cache_size); } - if (options.block_size) - table_options.block_size = options.block_size; - if (options.bloom_bits_per_key) { - table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(options.bloom_bits_per_key, true)); - } + table_options.block_size = options.block_size; + table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(options.bloom_bits_per_key, true)); + + ldoptions.max_background_compactions = options.max_background_compactions; + ldoptions.max_background_flushes = options.max_background_flushes; + //High priority threadpool is used for flusher + env->SetBackgroundThreads(options.flusher_threads, rocksdb::Env::Priority::HIGH); + //Low priority threadpool is used for compaction + env->SetBackgroundThreads(options.compaction_threads, rocksdb::Env::Priority::LOW); + + ldoptions.max_open_files = options.max_open_files; if (options.compression_type.length() == 0) ldoptions.compression = rocksdb::kNoCompression; else if(options.compression_type == "snappy") @@ -81,6 +101,18 @@ int RocksDBStore::do_open(ostream &out, bool create_if_missing) ldoptions.compression = rocksdb::kBZip2Compression; else ldoptions.compression = rocksdb::kNoCompression; + + if(options.disableDataSync) { + derr << "Warning: DataSync is disabled, may lose data on node failure" << dendl; + ldoptions.disableDataSync = options.disableDataSync; + } + + if(options.disableWAL) { + derr << "Warning: Write Ahead Log is disabled, may lose data on failure" << dendl; + } + if(options.wal_dir.length()) + ldoptions.wal_dir = options.wal_dir; + if (options.block_restart_interval) table_options.block_restart_interval = options.block_restart_interval; @@ -88,28 +120,17 @@ int RocksDBStore::do_open(ostream &out, bool create_if_missing) ldoptions.paranoid_checks = options.paranoid_checks; ldoptions.create_if_missing = create_if_missing; if (options.log_file.length()) { - rocksdb::Env *env = rocksdb::Env::Default(); env->NewLogger(options.log_file, 
&ldoptions.info_log); ldoptions.info_log->SetInfoLogLevel((rocksdb::InfoLogLevel)get_info_log_level(options.info_log_level)); } else { ldoptions.info_log_level = (rocksdb::InfoLogLevel)get_info_log_level(options.info_log_level); } - if(options.disableDataSync) - ldoptions.disableDataSync = options.disableDataSync; - if(options.num_levels) - ldoptions.num_levels = options.num_levels; - if(options.level0_file_num_compaction_trigger) - ldoptions.level0_file_num_compaction_trigger = options.level0_file_num_compaction_trigger; - if(options.level0_slowdown_writes_trigger) - ldoptions.level0_slowdown_writes_trigger = options.level0_slowdown_writes_trigger; - if(options.level0_stop_writes_trigger) - ldoptions.level0_stop_writes_trigger = options.level0_stop_writes_trigger; - if(options.wal_dir.length()) - ldoptions.wal_dir = options.wal_dir; //apply table_options ldoptions.table_factory.reset(NewBlockBasedTableFactory(table_options)); + //apply env setting + ldoptions.env = env; //rocksdb::DB *_db; rocksdb::Status status = rocksdb::DB::Open(ldoptions, path, &db); if (!status.ok()) { diff --git a/src/os/RocksDBStore.h b/src/os/RocksDBStore.h index 71c0854c82fc..30803a348b7a 100644 --- a/src/os/RocksDBStore.h +++ b/src/os/RocksDBStore.h @@ -110,48 +110,68 @@ public: */ struct options_t { uint64_t write_buffer_size; /// in-memory write buffer size - uint64_t write_buffer_num; /// in-memory write buffer number - uint64_t target_file_size_base; /// Target file size for compaction - int max_background_compactions; /// Maximum number of concurrent background compaction jobs - int max_background_flushes; /// Maximum number of concurrent background memtable flushea jobs - int max_open_files; /// maximum number of files RocksDB can open at once + int write_buffer_num; /// in-memory write buffer number + int min_write_buffer_number_to_merge; + + int level0_file_num_compaction_trigger; + int level0_slowdown_writes_trigger; + int level0_stop_writes_trigger; + + uint64_t 
max_bytes_for_level_base; + int max_bytes_for_level_multiplier; + uint64_t target_file_size_base; + int target_file_size_multiplier; + int num_levels; uint64_t cache_size; /// size of extra decompressed cache to use uint64_t block_size; /// user data per block int bloom_bits_per_key; /// number of bits per entry to put in a bloom filter - string compression_type; /// whether to use libsnappy compression or not + + int max_background_compactions; + int compaction_threads; + int max_background_flushes; + int flusher_threads; - // don't change these ones. No, seriously - int block_restart_interval; - bool error_if_exists; + uint64_t max_open_files; + string compression_type; /// whether to use libsnappy compression or not bool paranoid_checks; - uint64_t level0_file_num_compaction_trigger; - uint64_t level0_slowdown_writes_trigger; - uint64_t level0_stop_writes_trigger; - bool disableDataSync; - bool disableWAL; - int num_levels; - string log_file; string wal_dir; string info_log_level; + bool disableDataSync; + bool disableWAL; + + int block_restart_interval; + bool error_if_exists; options_t() : - write_buffer_size(0), //< 0 means default - max_open_files(0), //< 0 means default - cache_size(0), //< 0 means no cache (default) - block_size(0), //< 0 means default - bloom_bits_per_key(10), //< 10 is the default value which yields ~1% false positive rate. 
- compression_type("none"), //< set to false for no compression - block_restart_interval(0), //< 0 means default - error_if_exists(false), //< set to true if you want to check nonexistence - paranoid_checks(false), //< set to true if you want paranoid checks + write_buffer_size(0), + write_buffer_num(0), + min_write_buffer_number_to_merge(0), level0_file_num_compaction_trigger(0), - level0_slowdown_writes_trigger(0), - level0_stop_writes_trigger(0), + level0_slowdown_writes_trigger(-1), + level0_stop_writes_trigger(-1), + max_bytes_for_level_base(0), + max_bytes_for_level_multiplier(0), + target_file_size_base(0), + target_file_size_multiplier(0), + num_levels(0), + cache_size(0), /// size of extra decompressed cache to use + block_size(0), /// user data per block + bloom_bits_per_key(0), /// number of bits per entry to put in a bloom filter + max_background_compactions(0), + compaction_threads(0), + max_background_flushes(0), + flusher_threads(0), + + max_open_files(0), + compression_type("none"), + paranoid_checks(false), //< set to true if you want paranoid checks + info_log_level("info"), disableDataSync(false), disableWAL(false), - num_levels(0), - info_log_level("info") + + block_restart_interval(0), //< 0 means default + error_if_exists(false) //< set to true if you want to check nonexistence {} } options;