[RocksDB] Added nano second stopwatch and new perf counters to track block read cost

author Haobo Xu <haobo@fb.com>

Tue, 4 Jun 2013 06:09:15 +0000 (23:09 -0700)

committer Haobo Xu <haobo@fb.com>

Sun, 8 Sep 2013 04:14:54 +0000 (21:14 -0700)
author Haobo Xu <haobo@fb.com>
Tue, 4 Jun 2013 06:09:15 +0000 (23:09 -0700)
committer Haobo Xu <haobo@fb.com>
Sun, 8 Sep 2013 04:14:54 +0000 (21:14 -0700)
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform

index ecc408fe2c3466a5f11995facd68b12a0559ce1b..884f2e8363181b2246070d7c35060f1a8e27ce83 100755 (executable)
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@@ -82,7 +82,7 @@ case "$TARGET_OS" in
          if [ -z "$USE_CLANG" ]; then
              COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
          fi
-        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
          # PORT_FILES=port/linux/linux_specific.cc
          ;;
      SunOS)
diff --git a/db/dbformat.cc b/db/dbformat.cc

index 8456ba2f243aa54a51cc0c60b6e962584f495a4e..c1caa955ffd0395fb9917ff9648463dbea86792b 100644 (file)
--- a/db/dbformat.cc
+++ b/db/dbformat.cc
@@ -6,7 +6,7 @@
  #include "db/dbformat.h"
  #include "port/port.h"
  #include "util/coding.h"
-#include "rocksdb/perf_context.h"
+#include "util/perf_context_imp.h"
  
  namespace leveldb {
  
@@ -54,7 +54,7 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
    //    decreasing sequence number
    //    decreasing type (though sequence# should be enough to disambiguate)
    int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
-  perf_context.user_key_comparison_count++;
+  BumpPerfCount(&perf_context.user_key_comparison_count);
    if (r == 0) {
      const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
      const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
diff --git a/db/perf_context.cc b/db/perf_context.cc

deleted file mode 100644 (file)

index 08b5d8c..0000000
--- a/db/perf_context.cc
+++ /dev/null
@@ -1,12 +0,0 @@
-#include "rocksdb/perf_context.h"
-
-
-namespace leveldb {
-
-void PerfContext::Reset() {
-  user_key_comparison_count = 0;
-}
-
-__thread PerfContext perf_context;
-
-}
diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc

index cccbb356bd3ff9edf6cb9fe65f4d47b4a080a5b3..95b1880227856e1c12d79d9340c189f9cf990c96 100644 (file)
--- a/db/perf_context_test.cc
+++ b/db/perf_context_test.cc
@@ -1,7 +1,10 @@
  #include <iostream>
+#include <vector>
  
  #include "rocksdb/db.h"
  #include "rocksdb/perf_context.h"
+#include "util/histogram.h"
+#include "util/stop_watch.h"
  #include "util/testharness.h"
  
  
@@ -10,11 +13,11 @@ namespace leveldb {
  // Path to the database on file system
  const std::string kDbName = test::TmpDir() + "/perf_context_test";
  
-std::shared_ptr<DB> OpenDb() {
+std::shared_ptr<DB> OpenDb(size_t write_buffer_size) {
      DB* db;
      Options options;
      options.create_if_missing = true;
-    options.write_buffer_size = 1000000000;     // give it a big memtable
+    options.write_buffer_size = write_buffer_size;
      Status s = DB::Open(options, kDbName,  &db);
      ASSERT_OK(s);
      return std::shared_ptr<DB>(db);
@@ -24,11 +27,48 @@ class PerfContextTest { };
  
  int kTotalKeys = 100;
  
-TEST(PerfContextTest, KeyComparisonCount) {
+TEST(PerfContextTest, StopWatchNanoOverhead) {
+  // profile the timer cost by itself!
+  const int kTotalIterations = 1000000;
+  std::vector<uint64_t> timings(kTotalIterations);
+
+  StopWatchNano timer(Env::Default(), true);
+  for (auto& timing : timings) {
+    timing = timer.ElapsedNanos(true /* reset */);
+  }
+
+  HistogramImpl histogram;
+  for (const auto timing : timings) {
+    histogram.Add(timing);
+  }
+
+  std::cout << histogram.ToString();
+}
+
+TEST(PerfContextTest, StopWatchOverhead) {
+  // profile the timer cost by itself!
+  const int kTotalIterations = 1000000;
+  std::vector<uint64_t> timings(kTotalIterations);
+
+  StopWatch timer(Env::Default());
+  for (auto& timing : timings) {
+    timing = timer.ElapsedMicros();
+  }
+
+  HistogramImpl histogram;
+  uint64_t prev_timing = 0;
+  for (const auto timing : timings) {
+    histogram.Add(timing - prev_timing);
+    prev_timing = timing;
+  }
+
+  std::cout << histogram.ToString();
+}
  
+void ProfileKeyComparison() {
    DestroyDB(kDbName, Options());    // Start this test with a fresh DB
  
-  auto db = OpenDb();
+  auto db = OpenDb(1000000000);
  
    WriteOptions write_options;
    ReadOptions read_options;
@@ -63,12 +103,20 @@ TEST(PerfContextTest, KeyComparisonCount) {
              << max_user_key_comparison_get << "\n"
              << "avg user key comparison get:"
              << total_user_key_comparison_get/kTotalKeys << "\n";
-
  }
  
+TEST(PerfContextTest, KeyComparisonCount) {
+  SetPerfLevel(kDisable);
+  ProfileKeyComparison();
+
+  SetPerfLevel(kEnableCount);
+  ProfileKeyComparison();
  
+  SetPerfLevel(kEnableTime);
+  ProfileKeyComparison();
  }
  
+}
  
  int main(int argc, char** argv) {
  
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h

index ebfe4021bc1b9c3194cb66c5fde865bd43454145..f4803e2e560300a3817681b75844389e2b3b1ed4 100644 (file)
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -190,6 +190,13 @@ class Env {
    // useful for computing deltas of time.
    virtual uint64_t NowMicros() = 0;
  
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation simply relies on NowMicros
+  virtual uint64_t NowNanos() {
+    return NowMicros() * 1000;
+  }
+
    // Sleep/delay the thread for the perscribed number of micro-seconds.
    virtual void SleepForMicroseconds(int micros) = 0;
  
diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h

index 953a160a6e5128620fbff7900e5e689a14a2e897..45566ff19f8d04077377ec7ccd29d58350828ac8 100644 (file)
--- a/include/rocksdb/perf_context.h
+++ b/include/rocksdb/perf_context.h
@@ -5,6 +5,15 @@
  
  namespace leveldb {
  
+enum PerfLevel {
+  kDisable        = 0,  // disable perf stats
+  kEnableCount    = 1,  // enable only count stats
+  kEnableTime     = 2   // enable time stats too
+};
+
+// set the perf stats level
+void SetPerfLevel(PerfLevel level);
+
  // A thread local context for gathering performance counter efficiently
  // and transparently.
  
@@ -12,8 +21,13 @@ struct PerfContext {
  
    void Reset(); // reset all performance counters to zero
  
-
    uint64_t user_key_comparison_count; // total number of user key comparisons
+  uint64_t block_cache_hit_count;
+  uint64_t block_read_count;
+  uint64_t block_read_byte;
+  uint64_t block_read_time;
+  uint64_t block_checksum_time;
+  uint64_t block_decompress_time;
  };
  
  extern __thread PerfContext perf_context;
diff --git a/table/format.cc b/table/format.cc

index e2e6103bdb380e39b6e1bbc38df8d1521304186f..60467297f5631cba02560ab3060f88f9cdd946fb 100644 (file)
--- a/table/format.cc
+++ b/table/format.cc
@@ -4,11 +4,12 @@
  
  #include "table/format.h"
  
-#include "rocksdb/env.h"
  #include "port/port.h"
+#include "rocksdb/env.h"
  #include "table/block.h"
  #include "util/coding.h"
  #include "util/crc32c.h"
+#include "util/perf_context_imp.h"
  
  namespace leveldb {
  
@@ -69,7 +70,8 @@ Status Footer::DecodeFrom(Slice* input) {
  Status ReadBlockContents(RandomAccessFile* file,
                           const ReadOptions& options,
                           const BlockHandle& handle,
-                         BlockContents* result) {
+                         BlockContents* result,
+                         Env* env) {
    result->data = Slice();
    result->cachable = false;
    result->heap_allocated = false;
@@ -79,7 +81,14 @@ Status ReadBlockContents(RandomAccessFile* file,
    size_t n = static_cast<size_t>(handle.size());
    char* buf = new char[n + kBlockTrailerSize];
    Slice contents;
+
+  StopWatchNano timer(env);
+  StartPerfTimer(&timer);
    Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
+  BumpPerfCount(&perf_context.block_read_count);
+  BumpPerfCount(&perf_context.block_read_byte, n + kBlockTrailerSize);
+  BumpPerfTime(&perf_context.block_read_time, &timer);
+
    if (!s.ok()) {
      delete[] buf;
      return s;
@@ -99,6 +108,7 @@ Status ReadBlockContents(RandomAccessFile* file,
        s = Status::Corruption("block checksum mismatch");
        return s;
      }
+    BumpPerfTime(&perf_context.block_checksum_time, &timer);
    }
  
    char* ubuf = nullptr;
@@ -172,6 +182,8 @@ Status ReadBlockContents(RandomAccessFile* file,
        return Status::Corruption("bad block type");
    }
  
+  BumpPerfTime(&perf_context.block_decompress_time, &timer);
+
    return Status::OK();
  }
  
diff --git a/table/format.h b/table/format.h

index 88008dafd19668f6f2c85efa3bd3e39881d8de80..31fa4f6b655adc44221d7d336177445d66326ed3 100644 (file)
--- a/table/format.h
+++ b/table/format.h
@@ -94,7 +94,8 @@ struct BlockContents {
  extern Status ReadBlockContents(RandomAccessFile* file,
                                  const ReadOptions& options,
                                  const BlockHandle& handle,
-                                BlockContents* result);
+                                BlockContents* result,
+                                Env* env);
  
  // Implementation details follow.  Clients should ignore,
  
diff --git a/table/table.cc b/table/table.cc

index d2d198fb813bafe1ef57ee9aadac305c267cb628..6b59b3ce334784313a6600d7f990e25aaba891ea 100644 (file)
--- a/table/table.cc
+++ b/table/table.cc
@@ -19,6 +19,7 @@
  #include "table/two_level_iterator.h"
  
  #include "util/coding.h"
+#include "util/perf_context_imp.h"
  #include "util/stop_watch.h"
  
  namespace leveldb {
@@ -81,9 +82,10 @@ Status ReadBlock(RandomAccessFile* file,
                   const ReadOptions& options,
                   const BlockHandle& handle,
                   Block** result,
+                 Env* env,
                   bool* didIO = nullptr) {
    BlockContents contents;
-  Status s = ReadBlockContents(file, options, handle, &contents);
+  Status s = ReadBlockContents(file, options, handle, &contents, env);
    if (s.ok()) {
      *result = new Block(contents);
    }
@@ -118,14 +120,14 @@ Status Table::Open(const Options& options,
      return Status::InvalidArgument("file is too short to be an sstable");
    }
  
-
    Footer footer;
    s = footer.DecodeFrom(&footer_input);
    if (!s.ok()) return s;
  
    Block* index_block = nullptr;
    // TODO: we never really verify check sum for index block
-  s = ReadBlock(file.get(), ReadOptions(), footer.index_handle(), &index_block);
+  s = ReadBlock(file.get(), ReadOptions(), footer.index_handle(), &index_block,
+                options.env);
  
    if (s.ok()) {
      // We've successfully read the footer and the index block: we're
@@ -176,7 +178,7 @@ void Table::ReadMeta(const Footer& footer) {
    //  TODO: we never really verify check sum for meta index block
    Block* meta = nullptr;
    if (!ReadBlock(rep_->file.get(), ReadOptions(), footer.metaindex_handle(),
-                 &meta).ok()) {
+                 &meta, rep_->options.env).ok()) {
      // Do not propagate errors since meta info is not needed for operation
      return;
    }
@@ -203,7 +205,8 @@ void Table::ReadFilter(const Slice& filter_handle_value) {
    // requiring checksum verification in Table::Open.
    ReadOptions opt;
    BlockContents block;
-  if (!ReadBlockContents(rep_->file.get(), opt, filter_handle, &block).ok()) {
+  if (!ReadBlockContents(rep_->file.get(), opt, filter_handle, &block,
+                        rep_->options.env).ok()) {
      return;
    }
    if (block.heap_allocated) {
@@ -266,6 +269,7 @@ Iterator* Table::BlockReader(void* arg,
        if (cache_handle != nullptr) {
          block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
  
+        BumpPerfCount(&perf_context.block_cache_hit_count);
          RecordTick(statistics, BLOCK_CACHE_HIT);
        } else if (no_io) {
          // Did not find in block_cache and can't do IO
@@ -280,6 +284,7 @@ Iterator* Table::BlockReader(void* arg,
                  options,
                  handle,
                  &block,
+                table->rep_->options.env,
                  didIO
                );
          }
@@ -295,8 +300,9 @@ Iterator* Table::BlockReader(void* arg,
      } else if (no_io) {
        // Could not read from block_cache and can't do IO
        return NewErrorIterator(Status::Incomplete("no blocking io"));
-    }else {
-      s = ReadBlock(table->rep_->file.get(), options, handle, &block, didIO);
+    } else {
+      s = ReadBlock(table->rep_->file.get(), options, handle, &block,
+                    table->rep_->options.env, didIO);
      }
    }
  
diff --git a/util/env_posix.cc b/util/env_posix.cc

index 59314fefb1d8fc01dbcf663ec32a70a3a304eaac..3771c406dfcfd4cdac1a30bbec46128c41372515 100644 (file)
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@@ -960,6 +960,12 @@ class PosixEnv : public Env {
      return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
    }
  
+  virtual uint64_t NowNanos() {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+  }
+
    virtual void SleepForMicroseconds(int micros) {
      usleep(micros);
    }
diff --git a/util/perf_context.cc b/util/perf_context.cc

new file mode 100644 (file)

index 0000000..48525d2
--- /dev/null
+++ b/util/perf_context.cc
@@ -0,0 +1,22 @@
+#include "util/perf_context_imp.h"
+
+namespace leveldb {
+
+// by default, enable counts only
+PerfLevel perf_level = kEnableCount;
+
+void SetPerfLevel(PerfLevel level) { perf_level = level; }
+
+void PerfContext::Reset() {
+  user_key_comparison_count = 0;
+  block_cache_hit_count = 0;
+  block_read_count = 0;
+  block_read_byte = 0;
+  block_read_time = 0;
+  block_checksum_time = 0;
+  block_decompress_time = 0;
+}
+
+__thread PerfContext perf_context;
+
+}
diff --git a/util/perf_context_imp.h b/util/perf_context_imp.h

new file mode 100644 (file)

index 0000000..66df1c8
--- /dev/null
+++ b/util/perf_context_imp.h
@@ -0,0 +1,33 @@
+#ifndef PERF_CONTEXT_IMP_H
+#define PERF_CONTEXT_IMP_H
+
+#include "rocksdb/perf_context.h"
+#include "util/stop_watch.h"
+
+namespace leveldb {
+
+extern enum PerfLevel perf_level;
+
+inline void StartPerfTimer(StopWatchNano* timer) {
+  if (perf_level >= PerfLevel::kEnableTime) {
+    timer->Start();
+  }
+}
+
+inline void BumpPerfCount(uint64_t* count, uint64_t delta = 1) {
+  if (perf_level >= PerfLevel::kEnableCount) {
+    *count += delta;
+  }
+}
+
+inline void BumpPerfTime(uint64_t* time,
+                         StopWatchNano* timer,
+                         bool reset = true) {
+  if (perf_level >= PerfLevel::kEnableTime) {
+    *time += timer->ElapsedNanos(reset);
+  }
+}
+
+}
+
+#endif
diff --git a/util/stop_watch.h b/util/stop_watch.h

index ac0c02b9b815e4240fcd5d7a2aeb67650d80b859..8ccf2a006495cef3e963a20ca2f155ec1bb4d8b5 100644 (file)
--- a/util/stop_watch.h
+++ b/util/stop_watch.h
@@ -11,8 +11,8 @@ class StopWatch {
   public:
    StopWatch(
      Env * const env,
-    std::shared_ptr<Statistics> statistics,
-    const Histograms histogram_name) :
+    std::shared_ptr<Statistics> statistics = nullptr,
+    const Histograms histogram_name = DB_GET) :
        env_(env),
        start_time_(env->NowMicros()),
        statistics_(statistics),
@@ -37,5 +37,32 @@ class StopWatch {
    const Histograms histogram_name_;
  
  };
+
+// a nano second precision stopwatch
+class StopWatchNano {
+ public:
+  StopWatchNano(Env* const env, bool auto_start = false)
+      : env_(env), start_(0) {
+    if (auto_start) {
+      Start();
+    }
+  }
+
+  void Start() { start_ = env_->NowNanos(); }
+
+  uint64_t ElapsedNanos(bool reset = false) {
+    auto now = env_->NowNanos();
+    auto elapsed = now - start_;
+    if (reset) {
+      start_ = now;
+    }
+    return elapsed;
+  }
+
+ private:
+  Env* const env_;
+  uint64_t start_;
+};
+
  } // namespace leveldb
  #endif // STORAGE_LEVELDB_UTIL_STOP_WATCH_H_
author	Haobo Xu <haobo@fb.com>
	Tue, 4 Jun 2013 06:09:15 +0000 (23:09 -0700)
committer	Haobo Xu <haobo@fb.com>
	Sun, 8 Sep 2013 04:14:54 +0000 (21:14 -0700)
build_tools/build_detect_platform		patch \| blob \| history
db/dbformat.cc		patch \| blob \| history
db/perf_context.cc	[deleted file]	patch \| blob \| history
db/perf_context_test.cc		patch \| blob \| history
include/rocksdb/env.h		patch \| blob \| history
include/rocksdb/perf_context.h		patch \| blob \| history
table/format.cc		patch \| blob \| history
table/format.h		patch \| blob \| history
table/table.cc		patch \| blob \| history
util/env_posix.cc		patch \| blob \| history
util/perf_context.cc	[new file with mode: 0644]	patch \| blob
util/perf_context_imp.h	[new file with mode: 0644]	patch \| blob
util/stop_watch.h		patch \| blob \| history