]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
kv/rocksdb: enable rocksdb write path breakdown
author: haodong <haodong.tang@intel.com>
Fri, 28 Oct 2016 03:59:07 +0000 (11:59 +0800)
committer: haodong <haodong.tang@intel.com>
Wed, 2 Nov 2016 04:57:49 +0000 (12:57 +0800)
bluestore has unstable write performance partly because of
rocksdb internal overhead. So we add more breakdown in
the rocksdb write path.

Signed-off-by: Haodong Tang <haodong.tang@intel.com>
src/common/config_opts.h
src/kv/RocksDBStore.cc
src/kv/RocksDBStore.h

index 2cf5080d91444d5bf3ea2ccb2fb6a55fb42f23bf..eff9ae3f7023f97d4d5400b1953a3fe0a0e490c9 100644 (file)
@@ -1008,6 +1008,7 @@ OPTION(bluestore_freelist_blocks_per_key, OPT_INT, 128)
 OPTION(bluestore_bitmapallocator_blocks_per_zone, OPT_INT, 1024) // must be power of 2 aligned, e.g., 512, 1024, 2048...
 OPTION(bluestore_bitmapallocator_span_size, OPT_INT, 1024) // must be power of 2 aligned, e.g., 512, 1024, 2048...
 OPTION(bluestore_rocksdb_options, OPT_STR, "compression=kNoCompression,max_write_buffer_number=4,min_write_buffer_number_to_merge=1,recycle_log_file_num=4,write_buffer_size=268435456")
+OPTION(bluestore_rocksdb_perf, OPT_BOOL, false) // rocksdb breakdown
 OPTION(bluestore_fsck_on_mount, OPT_BOOL, false)
 OPTION(bluestore_fsck_on_mount_deep, OPT_BOOL, true)
 OPTION(bluestore_fsck_on_umount, OPT_BOOL, false)
index 989a375e5171665c786abb3b75a60ba3f52aa36c..f3e743d4d7810397e1a74d49f2921e9e3776a9ee 100644 (file)
@@ -277,7 +277,7 @@ int RocksDBStore::do_open(ostream &out, bool create_if_missing)
     dout(10) << __func__ << " using custom Env " << priv << dendl;
     opt.env = static_cast<rocksdb::Env*>(priv);
   }
-
+  
   auto cache = rocksdb::NewLRUCache(g_conf->rocksdb_cache_size, g_conf->rocksdb_cache_shard_bits);
   rocksdb::BlockBasedTableOptions bbt_opts;
   bbt_opts.block_size = g_conf->rocksdb_block_size;
@@ -293,7 +293,7 @@ int RocksDBStore::do_open(ostream &out, bool create_if_missing)
     derr << status.ToString() << dendl;
     return -EINVAL;
   }
-
+  
   PerfCountersBuilder plb(g_ceph_context, "rocksdb", l_rocksdb_first, l_rocksdb_last);
   plb.add_u64_counter(l_rocksdb_gets, "get", "Gets");
   plb.add_u64_counter(l_rocksdb_txns, "submit_transaction", "Submit transactions");
@@ -305,6 +305,11 @@ int RocksDBStore::do_open(ostream &out, bool create_if_missing)
   plb.add_u64_counter(l_rocksdb_compact_range, "compact_range", "Compactions by range");
   plb.add_u64_counter(l_rocksdb_compact_queue_merge, "compact_queue_merge", "Mergings of ranges in compaction queue");
   plb.add_u64(l_rocksdb_compact_queue_len, "compact_queue_len", "Length of compaction queue");
+  plb.add_time_avg(l_rocksdb_write_wal_time, "rocksdb_write_wal_time", "Rocksdb write wal time");
+  plb.add_time_avg(l_rocksdb_write_memtable_time, "rocksdb_write_memtable_time", "Rocksdb write memtable time");
+  plb.add_time_avg(l_rocksdb_write_delay_time, "rocksdb_write_delay_time", "Rocksdb write delay time");
+  plb.add_time_avg(l_rocksdb_write_pre_and_post_process_time, 
+      "rocksdb_write_pre_and_post_time", "total time spent on writing a record, excluding write process");
   logger = plb.create_perf_counters();
   cct->get_perfcounters_collection()->add(logger);
 
@@ -361,6 +366,13 @@ void RocksDBStore::close()
 int RocksDBStore::submit_transaction(KeyValueDB::Transaction t)
 {
   utime_t start = ceph_clock_now(g_ceph_context);
+  // enable rocksdb breakdown
+  // considering performance overhead, default is disabled
+  if (g_conf->bluestore_rocksdb_perf) {
+    rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
+    rocksdb::perf_context.Reset();
+  }
+
   RocksDBTransactionImpl * _t =
     static_cast<RocksDBTransactionImpl *>(t.get());
   rocksdb::WriteOptions woptions;
@@ -378,14 +390,39 @@ int RocksDBStore::submit_transaction(KeyValueDB::Transaction t)
          << " Rocksdb transaction: " << rocks_txc.seen << dendl;
   }
   utime_t lat = ceph_clock_now(g_ceph_context) - start;
+  utime_t write_memtable_time;
+  utime_t write_delay_time;
+  utime_t write_wal_time;
+  utime_t write_pre_and_post_process_time;
+  write_wal_time.set_from_double(
+      static_cast<double>(rocksdb::perf_context.write_wal_time)/1000000000);
+  write_memtable_time.set_from_double(
+      static_cast<double>(rocksdb::perf_context.write_memtable_time)/1000000000);
+  write_delay_time.set_from_double(
+      static_cast<double>(rocksdb::perf_context.write_delay_time)/1000000000);
+  write_pre_and_post_process_time.set_from_double(
+      static_cast<double>(rocksdb::perf_context.write_pre_and_post_process_time)/1000000000);
+
   logger->inc(l_rocksdb_txns);
   logger->tinc(l_rocksdb_submit_latency, lat);
+  logger->tinc(l_rocksdb_write_memtable_time, write_memtable_time);
+  logger->tinc(l_rocksdb_write_delay_time, write_delay_time);
+  logger->tinc(l_rocksdb_write_wal_time, write_wal_time);
+  logger->tinc(l_rocksdb_write_pre_and_post_process_time, write_pre_and_post_process_time);
+
   return s.ok() ? 0 : -1;
 }
 
 int RocksDBStore::submit_transaction_sync(KeyValueDB::Transaction t)
 {
   utime_t start = ceph_clock_now(g_ceph_context);
+  // enable rocksdb breakdown
+  // considering performance overhead, default is disabled
+  if (g_conf->bluestore_rocksdb_perf) {
+    rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
+    rocksdb::perf_context.Reset();
+  }
+
   RocksDBTransactionImpl * _t =
     static_cast<RocksDBTransactionImpl *>(t.get());
   rocksdb::WriteOptions woptions;
@@ -404,8 +441,26 @@ int RocksDBStore::submit_transaction_sync(KeyValueDB::Transaction t)
          << " Rocksdb transaction: " << rocks_txc.seen << dendl;
   }
   utime_t lat = ceph_clock_now(g_ceph_context) - start;
+  utime_t write_memtable_time;
+  utime_t write_delay_time;
+  utime_t write_wal_time;
+  utime_t write_pre_and_post_process_time;
+  write_wal_time.set_from_double(
+      static_cast<double>(rocksdb::perf_context.write_wal_time)/1000000000);
+  write_memtable_time.set_from_double(
+      static_cast<double>(rocksdb::perf_context.write_memtable_time)/1000000000);
+  write_delay_time.set_from_double(
+      static_cast<double>(rocksdb::perf_context.write_delay_time)/1000000000);
+  write_pre_and_post_process_time.set_from_double(
+      static_cast<double>(rocksdb::perf_context.write_pre_and_post_process_time)/1000000000);
+
   logger->inc(l_rocksdb_txns_sync);
   logger->tinc(l_rocksdb_submit_sync_latency, lat);
+  logger->tinc(l_rocksdb_write_memtable_time, write_memtable_time);
+  logger->tinc(l_rocksdb_write_delay_time, write_delay_time);
+  logger->tinc(l_rocksdb_write_wal_time, write_wal_time);
+  logger->tinc(l_rocksdb_write_pre_and_post_process_time, write_pre_and_post_process_time);
+
   return s.ok() ? 0 : -1;
 }
 int RocksDBStore::get_info_log_level(string info_log_level)
index 371ca6ca6dece7eb3b2f9fb10cd0c564ca558084..6c43bf1eb71e76a03ef469b4c20af0994f917760 100644 (file)
@@ -12,6 +12,7 @@
 #include <memory>
 #include <boost/scoped_ptr.hpp>
 #include "rocksdb/write_batch.h"
+#include "rocksdb/perf_context.h"
 #include <errno.h>
 #include "common/errno.h"
 #include "common/dout.h"
@@ -34,6 +35,10 @@ enum {
   l_rocksdb_compact_range,
   l_rocksdb_compact_queue_merge,
   l_rocksdb_compact_queue_len,
+  l_rocksdb_write_wal_time,
+  l_rocksdb_write_memtable_time,
+  l_rocksdb_write_delay_time,
+  l_rocksdb_write_pre_and_post_process_time,
   l_rocksdb_last,
 };