void EraseThreadStatusDbInfo() const;
// If disable_memtable is set the application logic must guarantee that the
- // batch will still be skipped from memtable during the recovery. In
- // WriteCommitted it is guarnateed since disable_memtable is used for prepare
- // batch which will be written to memtable later during the commit, and in
- // WritePrepared it is guaranteed since it will be used only for WAL markers
- // which will never be written to memtable.
- // batch_cnt is expected to be non-zero in seq_per_batch mode and indicates
- // the number of sub-patches. A sub-patch is a subset of the write batch that
- // does not have duplicate keys.
+ // batch will still be skipped from memtable during the recovery. An exception
+ // to this is seq_per_batch_ mode: since each batch already takes one seq, it
+ // is ok for the batch to be written to memtable during recovery as long as it
+ // takes only one sequence number, i.e., has no duplicate keys.
+ // In WriteCommitted it is guaranteed since disable_memtable is used only for
+ // the prepare batch, which will be written to memtable later during the
+ // commit, and in WritePrepared it is guaranteed since disable_memtable is
+ // used only for WAL markers, which will never be written to memtable. If the
+ // commit marker is accompanied with a CommitTimeWriteBatch that is not
+ // written to memtable, the one-seq-per-batch policy is not violated as long
+ // as that batch has no duplicate keys.
+ // batch_cnt is expected to be non-zero in seq_per_batch mode and
+ // indicates the number of sub-batches. A sub-batch is a subset of the write
+ // batch that does not have duplicate keys (see the sketch after this
+ // declaration).
Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
WriteCallback* callback = nullptr,
uint64_t* log_used = nullptr, uint64_t log_ref = 0,
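// To make the sub-batch notion concrete, here is a minimal standalone sketch
// (CountSubBatches is a hypothetical helper, a simplification of the counting
// the real code does by iterating the write batch): keys are scanned in batch
// order and a new sub-batch starts whenever a key repeats, so each sub-batch
// is duplicate-key-free and, in seq_per_batch mode, consumes exactly one
// sequence number.
#include <cstddef>
#include <set>
#include <string>
#include <vector>

size_t CountSubBatches(const std::vector<std::string>& keys) {
  size_t batch_cnt = keys.empty() ? 0 : 1;
  std::set<std::string> seen;  // keys of the current sub-batch
  for (const auto& key : keys) {
    if (!seen.insert(key).second) {  // duplicate key: start a new sub-batch
      batch_cnt++;
      seen.clear();
      seen.insert(key);
    }
  }
  return batch_cnt;
}
// e.g., keys {a, b, a, c, a} split into {a, b}, {a, c}, {a}: batch_cnt == 3.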
insert_with_hint_prefix_extractor_->InDomain(key_slice)) {
Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice);
bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]);
- if (!res) {
+ if (UNLIKELY(!res)) {
return res;
}
} else {
bool res = table->InsertKey(handle);
- if (!res) {
+ if (UNLIKELY(!res)) {
return res;
}
}
UpdateFlushState();
} else {
bool res = table->InsertKeyConcurrently(handle);
- if (!res) {
+ if (UNLIKELY(!res)) {
return res;
}
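// For reference, the UNLIKELY hint used above boils down to __builtin_expect
// on GCC/Clang, steering code layout so the failed-insert branch stays off
// the hot path. RocksDB defines it in port/likely.h along these lines, with
// a no-op fallback for other compilers:
#if defined(__GNUC__) && __GNUC__ >= 4
#define LIKELY(x) (__builtin_expect((x), 1))
#define UNLIKELY(x) (__builtin_expect((x), 0))
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif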
// # of bytes in the blob files evicted because of BlobDB is full.
BLOB_DB_FIFO_BYTES_EVICTED,
+ // These counters indicate a performance issue in WritePrepared transactions.
+ // We should not see them ticking much.
+ // # of times prepare_mutex_ is acquired in the fast path.
+ TXN_PREPARE_MUTEX_OVERHEAD,
+ // # of times old_commit_map_mutex_ is acquired in the fast path.
+ TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
+ // # of times we checked a batch for duplicate keys.
+ TXN_DUPLICATE_KEY_OVERHEAD,
+ // # of times snapshot_mutex_ is acquired in the fast path.
+ TXN_SNAPSHOT_MUTEX_OVERHEAD,
+
TICKER_ENUM_MAX
};
{BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"},
{BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"},
{BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"},
+ {TXN_PREPARE_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.prepare"},
+ {TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
+ "rocksdb.txn.overhead.mutex.old.commit.map"},
+ {TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"},
+ {TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"},
};
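// A minimal sketch of watching the new tickers through the public Statistics
// API (ReportTxnOverhead is a hypothetical helper; the statistics object is
// the one installed via Options::statistics = rocksdb::CreateDBStatistics()).
// These counters ticking steadily means the WritePrepared fast path is being
// missed and is worth investigating.
#include <cinttypes>
#include <cstdio>
#include <memory>
#include "rocksdb/statistics.h"

void ReportTxnOverhead(const std::shared_ptr<rocksdb::Statistics>& stats) {
  fprintf(stderr, "prepare_mutex_ taken in fast path: %" PRIu64 "\n",
          stats->getTickerCount(rocksdb::TXN_PREPARE_MUTEX_OVERHEAD));
  fprintf(stderr, "duplicate-key checks: %" PRIu64 "\n",
          stats->getTickerCount(rocksdb::TXN_DUPLICATE_KEY_OVERHEAD));
}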
/**
bool deadlock_detect = false;
// If set, it states that the CommitTimeWriteBatch represents the latest state
- // of the application and meant to be used later during recovery. It enables
- // an optimization to postpone updating the memtable with CommitTimeWriteBatch
- // to only SwitchMemtable or recovery.
+ // of the application, has only one sub-batch, i.e., no duplicate keys, and is
+ // meant to be used later during recovery. It enables an optimization to
+ // postpone updating the memtable with CommitTimeWriteBatch to only
+ // SwitchMemtable or recovery.
bool use_only_the_last_commit_time_batch_for_recovery = false;
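// A minimal usage sketch (CommitWithLatestStateBatch is a hypothetical
// helper; error handling elided; assumes txn_db is backed by a WritePrepared
// TransactionDB): the application stages its duplicate-key-free latest state
// in the commit-time batch, and with this flag set the memtable write of
// that batch is deferred to SwitchMemtable or recovery.
#include "rocksdb/utilities/transaction_db.h"

void CommitWithLatestStateBatch(rocksdb::TransactionDB* txn_db) {
  rocksdb::TransactionOptions txn_options;
  txn_options.use_only_the_last_commit_time_batch_for_recovery = true;
  rocksdb::Transaction* txn =
      txn_db->BeginTransaction(rocksdb::WriteOptions(), txn_options);
  txn->SetName("xid1");
  txn->Put("key", "value");
  // The commit-time batch must form a single sub-batch: no duplicate keys.
  txn->GetCommitTimeWriteBatch()->Put("latest_state", "v2");
  txn->Prepare();
  txn->Commit();
  delete txn;
}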
// TODO(agiardullo): TransactionDB does not yet support comparators that allow
auto s = batch->Iterate(&counter);
assert(s.ok());
batch_cnt = counter.BatchCount();
- // TODO(myabandeh): replace me with a stat
- ROCKS_LOG_WARN(info_log_, "Duplicate key overhead: %" PRIu64 " batches",
- static_cast<uint64_t>(batch_cnt));
+ WPRecordTick(TXN_DUPLICATE_KEY_OVERHEAD);
+ ROCKS_LOG_DETAILS(info_log_, "Duplicate key overhead: %" PRIu64 " batches",
+ static_cast<uint64_t>(batch_cnt));
}
assert(batch_cnt);
while (!prepared_txns_.empty() && prepared_txns_.top() <= new_max) {
auto to_be_popped = prepared_txns_.top();
delayed_prepared_.insert(to_be_popped);
- // TODO(myabandeh): also add a stat
ROCKS_LOG_WARN(info_log_,
"prepared_mutex_ overhead %" PRIu64 " (prep=%" PRIu64
" new_max=%" PRIu64 " oldmax=%" PRIu64,
// old_commit_map_. Check and do garbage collection if that is the case.
bool need_gc = false;
{
- // TODO(myabandeh): also add a stat
+ WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead");
ReadLock rl(&old_commit_map_mutex_);
auto prep_set_entry = old_commit_map_.find(snap_seq);
need_gc = prep_set_entry != old_commit_map_.end();
}
if (need_gc) {
- // TODO(myabandeh): also add a stat
+ WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead");
WriteLock wl(&old_commit_map_mutex_);
old_commit_map_.erase(snap_seq);
#ifndef NDEBUG
size_t sync_i = 0;
#endif
- // TODO(myabandeh): replace me with a stat
- ROCKS_LOG_WARN(info_log_, "snapshots_mutex_ overhead");
+ ROCKS_LOG_DETAILS(info_log_, "snapshots_mutex_ overhead");
WriteLock wl(&snapshots_mutex_);
snapshots_version_ = version;
// We update the list concurrently with the readers.
if (UNLIKELY(SNAPSHOT_CACHE_SIZE < cnt && ip1 == SNAPSHOT_CACHE_SIZE &&
snapshot_seq < evicted.prep_seq)) {
// Then access the less efficient list of snapshots_
- // TODO(myabandeh): also add a stat
+ WPRecordTick(TXN_SNAPSHOT_MUTEX_OVERHEAD);
ROCKS_LOG_WARN(info_log_, "snapshots_mutex_ overhead");
ReadLock rl(&snapshots_mutex_);
// Items could have moved from the snapshots_ to snapshot_cache_ before
}
// then snapshot_seq < commit_seq
if (prep_seq <= snapshot_seq) { // overlapping range
+ WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead");
- // TODO(myabandeh): also add a stat
WriteLock wl(&old_commit_map_mutex_);
old_commit_map_empty_.store(false, std::memory_order_release);
auto& vec = old_commit_map_[snapshot_seq];
}
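// The condition above, restated as a standalone predicate
// (CommitStraddlesSnapshot is a hypothetical helper): a transaction prepared
// at or before a snapshot but committed after it straddles that snapshot and
// must stay invisible to it, which is why such pairs are recorded in
// old_commit_map_ keyed by snapshot_seq.
#include <cstdint>

bool CommitStraddlesSnapshot(uint64_t prep_seq, uint64_t commit_seq,
                             uint64_t snapshot_seq) {
  return prep_seq <= snapshot_seq && snapshot_seq < commit_seq;
}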
if (!delayed_prepared_empty_.load(std::memory_order_acquire)) {
// We should not normally reach here
+ WPRecordTick(TXN_PREPARE_MUTEX_OVERHEAD);
ReadLock rl(&prepared_mutex_);
- // TODO(myabandeh): also add a stat
ROCKS_LOG_WARN(info_log_, "prepared_mutex_ overhead %" PRIu64,
static_cast<uint64_t>(delayed_prepared_.size()));
if (delayed_prepared_.find(prep_seq) != delayed_prepared_.end()) {
// We should not normally reach here unless snapshot_seq is old. This is a
// rare case and it is ok to pay the cost of mutex ReadLock for such old,
// reading transactions.
- // TODO(myabandeh): also add a stat
+ WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead");
ReadLock rl(&old_commit_map_mutex_);
auto prep_set_entry = old_commit_map_.find(snapshot_seq);
void Init(const TransactionDBOptions& /* unused */);
+ void WPRecordTick(uint32_t ticker_type) const {
+ RecordTick(db_impl_->immutable_db_options_.statistics.get(), ticker_type);
+ }
+
// A heap with the amortized O(1) complexity for erase. It uses one extra heap
// to keep track of erased entries that are not yet on top of the main heap.
class PreparedHeap {
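// An illustrative standalone sketch of the lazy-erase idea described above
// (LazyEraseMinHeap is hypothetical, not the PreparedHeap implementation
// itself; it assumes a value is erased at most once and only after it was
// pushed): erased entries are parked in a side heap and removed from the
// main heap only when they surface at its top, so erase never has to search
// the main heap.
#include <cstdint>
#include <functional>
#include <queue>
#include <vector>

class LazyEraseMinHeap {
 public:
  void push(uint64_t v) { main_.push(v); }
  void erase(uint64_t v) { erased_.push(v); }  // defer the actual removal
  bool empty() {
    drain();
    return main_.empty();
  }
  uint64_t top() {
    drain();
    return main_.top();
  }
  void pop() {
    drain();
    main_.pop();
  }

 private:
  // Discard matched pairs while the smallest erased entry is the main top.
  void drain() {
    while (!erased_.empty() && !main_.empty() &&
           erased_.top() == main_.top()) {
      erased_.pop();
      main_.pop();
    }
  }
  std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>>
      main_, erased_;
};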