From bb340c8d7a47d84c54b15c6031a97c3155f9d253 Mon Sep 17 00:00:00 2001 From: haodong Date: Fri, 28 Oct 2016 11:59:07 +0800 Subject: [PATCH] kv/rocksdb: enable rocksdb write path breakdown bluestore has unstable write performance partly because of rocksdb internal overhead. So we add more breakdown in rocksdb write path. Signed-off-by: Haodong Tang --- src/common/config_opts.h | 1 + src/kv/RocksDBStore.cc | 59 ++++++++++++++++++++++++++++++++++++++-- src/kv/RocksDBStore.h | 5 ++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 2cf5080d914..eff9ae3f702 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1008,6 +1008,7 @@ OPTION(bluestore_freelist_blocks_per_key, OPT_INT, 128) OPTION(bluestore_bitmapallocator_blocks_per_zone, OPT_INT, 1024) // must be power of 2 aligned, e.g., 512, 1024, 2048... OPTION(bluestore_bitmapallocator_span_size, OPT_INT, 1024) // must be power of 2 aligned, e.g., 512, 1024, 2048... 
OPTION(bluestore_rocksdb_options, OPT_STR, "compression=kNoCompression,max_write_buffer_number=4,min_write_buffer_number_to_merge=1,recycle_log_file_num=4,write_buffer_size=268435456") +OPTION(bluestore_rocksdb_perf, OPT_BOOL, false) // rocksdb breakdown OPTION(bluestore_fsck_on_mount, OPT_BOOL, false) OPTION(bluestore_fsck_on_mount_deep, OPT_BOOL, true) OPTION(bluestore_fsck_on_umount, OPT_BOOL, false) diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc index 989a375e517..f3e743d4d78 100644 --- a/src/kv/RocksDBStore.cc +++ b/src/kv/RocksDBStore.cc @@ -277,7 +277,7 @@ int RocksDBStore::do_open(ostream &out, bool create_if_missing) dout(10) << __func__ << " using custom Env " << priv << dendl; opt.env = static_cast<rocksdb::Env*>(priv); } - + auto cache = rocksdb::NewLRUCache(g_conf->rocksdb_cache_size, g_conf->rocksdb_cache_shard_bits); rocksdb::BlockBasedTableOptions bbt_opts; bbt_opts.block_size = g_conf->rocksdb_block_size; @@ -293,7 +293,7 @@ int RocksDBStore::do_open(ostream &out, bool create_if_missing) derr << status.ToString() << dendl; return -EINVAL; } - + PerfCountersBuilder plb(g_ceph_context, "rocksdb", l_rocksdb_first, l_rocksdb_last); plb.add_u64_counter(l_rocksdb_gets, "get", "Gets"); plb.add_u64_counter(l_rocksdb_txns, "submit_transaction", "Submit transactions"); @@ -305,6 +305,11 @@ int RocksDBStore::do_open(ostream &out, bool create_if_missing) plb.add_u64_counter(l_rocksdb_compact_range, "compact_range", "Compactions by range"); plb.add_u64_counter(l_rocksdb_compact_queue_merge, "compact_queue_merge", "Mergings of ranges in compaction queue"); plb.add_u64(l_rocksdb_compact_queue_len, "compact_queue_len", "Length of compaction queue"); + plb.add_time_avg(l_rocksdb_write_wal_time, "rocksdb_write_wal_time", "Rocksdb write wal time"); + plb.add_time_avg(l_rocksdb_write_memtable_time, "rocksdb_write_memtable_time", "Rocksdb write memtable time"); + plb.add_time_avg(l_rocksdb_write_delay_time, "rocksdb_write_delay_time", "Rocksdb write delay time"); + 
plb.add_time_avg(l_rocksdb_write_pre_and_post_process_time, + "rocksdb_write_pre_and_post_time", "total time spent on writing a record, excluding write process"); logger = plb.create_perf_counters(); cct->get_perfcounters_collection()->add(logger); @@ -361,6 +366,13 @@ void RocksDBStore::close() int RocksDBStore::submit_transaction(KeyValueDB::Transaction t) { utime_t start = ceph_clock_now(g_ceph_context); + // enable rocksdb breakdown + // considering performance overhead, default is disabled + if (g_conf->bluestore_rocksdb_perf) { + rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex); + rocksdb::perf_context.Reset(); + } + RocksDBTransactionImpl * _t = static_cast<RocksDBTransactionImpl *>(t.get()); rocksdb::WriteOptions woptions; @@ -378,14 +390,39 @@ int RocksDBStore::submit_transaction(KeyValueDB::Transaction t) << " Rocksdb transaction: " << rocks_txc.seen << dendl; } utime_t lat = ceph_clock_now(g_ceph_context) - start; + utime_t write_memtable_time; + utime_t write_delay_time; + utime_t write_wal_time; + utime_t write_pre_and_post_process_time; + write_wal_time.set_from_double( + static_cast<double>(rocksdb::perf_context.write_wal_time)/1000000000); + write_memtable_time.set_from_double( + static_cast<double>(rocksdb::perf_context.write_memtable_time)/1000000000); + write_delay_time.set_from_double( + static_cast<double>(rocksdb::perf_context.write_delay_time)/1000000000); + write_pre_and_post_process_time.set_from_double( + static_cast<double>(rocksdb::perf_context.write_pre_and_post_process_time)/1000000000); + logger->inc(l_rocksdb_txns); logger->tinc(l_rocksdb_submit_latency, lat); + logger->tinc(l_rocksdb_write_memtable_time, write_memtable_time); + logger->tinc(l_rocksdb_write_delay_time, write_delay_time); + logger->tinc(l_rocksdb_write_wal_time, write_wal_time); + logger->tinc(l_rocksdb_write_pre_and_post_process_time, write_pre_and_post_process_time); + return s.ok() ? 
0 : -1; } int RocksDBStore::submit_transaction_sync(KeyValueDB::Transaction t) { utime_t start = ceph_clock_now(g_ceph_context); + // enable rocksdb breakdown + // considering performance overhead, default is disabled + if (g_conf->bluestore_rocksdb_perf) { + rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex); + rocksdb::perf_context.Reset(); + } + RocksDBTransactionImpl * _t = static_cast<RocksDBTransactionImpl *>(t.get()); rocksdb::WriteOptions woptions; @@ -404,8 +441,26 @@ int RocksDBStore::submit_transaction_sync(KeyValueDB::Transaction t) << " Rocksdb transaction: " << rocks_txc.seen << dendl; } utime_t lat = ceph_clock_now(g_ceph_context) - start; + utime_t write_memtable_time; + utime_t write_delay_time; + utime_t write_wal_time; + utime_t write_pre_and_post_process_time; + write_wal_time.set_from_double( + static_cast<double>(rocksdb::perf_context.write_wal_time)/1000000000); + write_memtable_time.set_from_double( + static_cast<double>(rocksdb::perf_context.write_memtable_time)/1000000000); + write_delay_time.set_from_double( + static_cast<double>(rocksdb::perf_context.write_delay_time)/1000000000); + write_pre_and_post_process_time.set_from_double( + static_cast<double>(rocksdb::perf_context.write_pre_and_post_process_time)/1000000000); + logger->inc(l_rocksdb_txns_sync); logger->tinc(l_rocksdb_submit_sync_latency, lat); + logger->tinc(l_rocksdb_write_memtable_time, write_memtable_time); + logger->tinc(l_rocksdb_write_delay_time, write_delay_time); + logger->tinc(l_rocksdb_write_wal_time, write_wal_time); + logger->tinc(l_rocksdb_write_pre_and_post_process_time, write_pre_and_post_process_time); + return s.ok() ? 
0 : -1; } int RocksDBStore::get_info_log_level(string info_log_level) diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h index 371ca6ca6de..6c43bf1eb71 100644 --- a/src/kv/RocksDBStore.h +++ b/src/kv/RocksDBStore.h @@ -12,6 +12,7 @@ #include #include #include "rocksdb/write_batch.h" +#include "rocksdb/perf_context.h" #include #include "common/errno.h" #include "common/dout.h" @@ -34,6 +35,10 @@ enum { l_rocksdb_compact_range, l_rocksdb_compact_queue_merge, l_rocksdb_compact_queue_len, + l_rocksdb_write_wal_time, + l_rocksdb_write_memtable_time, + l_rocksdb_write_delay_time, + l_rocksdb_write_pre_and_post_process_time, l_rocksdb_last, }; -- 2.39.5