void EraseThreadStatusDbInfo() const;
// If disable_memtable is set the application logic must guarantee that the
- // batch will still be skipped from memtable during the recovery. In
- // WriteCommitted it is guarnateed since disable_memtable is used for prepare
- // batch which will be written to memtable later during the commit, and in
- // WritePrepared it is guaranteed since it will be used only for WAL markers
- // which will never be written to memtable.
- // batch_cnt is expected to be non-zero in seq_per_batch mode and indicates
- // the number of sub-patches. A sub-patch is a subset of the write batch that
- // does not have duplicate keys.
+ // batch will still be skipped from memtable during the recovery. An exception
+ // to this is seq_per_batch_ mode: since each batch already takes one seq, it
+ // is ok for the batch to be written to memtable during recovery as long as it
+ // takes only one sequence number, i.e., has no duplicate keys.
+ // In WriteCommitted it is guaranteed since disable_memtable is used only for
+ // the prepare batch, which will be written to memtable later during the
+ // commit, and in WritePrepared it is guaranteed since disable_memtable is
+ // used only for WAL markers, which will never be written to memtable. If the
+ // commit marker is accompanied with a CommitTimeWriteBatch that is not
+ // written to memtable, the one-seq-per-batch policy is not violated as long
+ // as that batch has no duplicate keys.
+ // batch_cnt is expected to be non-zero in seq_per_batch mode and
+ // indicates the number of sub-batches. A sub-batch is a subset of the write
+ // batch that does not have duplicate keys (see the sketch after this
+ // declaration).
Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
WriteCallback* callback = nullptr,
uint64_t* log_used = nullptr, uint64_t log_ref = 0,
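// To make the sub-batch notion concrete, here is a minimal standalone sketch
// (CountSubBatches is a hypothetical helper, a simplification of the counting
// the real code does by iterating the write batch): keys are scanned in batch
// order and a new sub-batch starts whenever a key repeats, so each sub-batch
// is duplicate-key-free and, in seq_per_batch mode, consumes exactly one
// sequence number.
#include <cstddef>
#include <set>
#include <string>
#include <vector>

size_t CountSubBatches(const std::vector<std::string>& keys) {
  size_t batch_cnt = keys.empty() ? 0 : 1;
  std::set<std::string> seen;  // keys of the current sub-batch
  for (const auto& key : keys) {
    if (!seen.insert(key).second) {  // duplicate key: start a new sub-batch
      batch_cnt++;
      seen.clear();
      seen.insert(key);
    }
  }
  return batch_cnt;
}
// e.g., keys {a, b, a, c, a} split into {a, b}, {a, c}, {a}: batch_cnt == 3.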
insert_with_hint_prefix_extractor_->InDomain(key_slice)) {
Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice);
bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]);
- if (!res) {
+ if (UNLIKELY(!res)) {
return res;
}
} else {
bool res = table->InsertKey(handle);
- if (!res) {
+ if (UNLIKELY(!res)) {
return res;
}
}
UpdateFlushState();
} else {
bool res = table->InsertKeyConcurrently(handle);
- if (!res) {
+ if (UNLIKELY(!res)) {
return res;
}
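// For reference, the UNLIKELY hint used above boils down to __builtin_expect
// on GCC/Clang, steering code layout so the failed-insert branch stays off
// the hot path. RocksDB defines it in port/likely.h along these lines, with
// a no-op fallback for other compilers:
#if defined(__GNUC__) && __GNUC__ >= 4
#define LIKELY(x) (__builtin_expect((x), 1))
#define UNLIKELY(x) (__builtin_expect((x), 0))
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif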
// # of bytes in the blob files evicted because of BlobDB is full.
BLOB_DB_FIFO_BYTES_EVICTED,
+ // These counters indicate a performance issue in WritePrepared transactions.
+ // We should not see them ticking much.
+ // # of times prepare_mutex_ is acquired in the fast path.
+ TXN_PREPARE_MUTEX_OVERHEAD,
+ // # of times old_commit_map_mutex_ is acquired in the fast path.
+ TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
+ // # of times we checked a batch for duplicate keys.
+ TXN_DUPLICATE_KEY_OVERHEAD,
+ // # of times snapshot_mutex_ is acquired in the fast path.
+ TXN_SNAPSHOT_MUTEX_OVERHEAD,
+
TICKER_ENUM_MAX
};
{BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"},
{BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"},
{BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"},
+ {TXN_PREPARE_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.prepare"},
+ {TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
+ "rocksdb.txn.overhead.mutex.old.commit.map"},
+ {TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"},
+ {TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"},
};
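// A minimal sketch of watching the new tickers through the public Statistics
// API (ReportTxnOverhead is a hypothetical helper; the statistics object is
// the one installed via Options::statistics = rocksdb::CreateDBStatistics()).
// These counters ticking steadily means the WritePrepared fast path is being
// missed and is worth investigating.
#include <cinttypes>
#include <cstdio>
#include <memory>
#include "rocksdb/statistics.h"

void ReportTxnOverhead(const std::shared_ptr<rocksdb::Statistics>& stats) {
  fprintf(stderr, "prepare_mutex_ taken in fast path: %" PRIu64 "\n",
          stats->getTickerCount(rocksdb::TXN_PREPARE_MUTEX_OVERHEAD));
  fprintf(stderr, "duplicate-key checks: %" PRIu64 "\n",
          stats->getTickerCount(rocksdb::TXN_DUPLICATE_KEY_OVERHEAD));
}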
/**
bool deadlock_detect = false;
// If set, it states that the CommitTimeWriteBatch represents the latest state
- // of the application and meant to be used later during recovery. It enables
- // an optimization to postpone updating the memtable with CommitTimeWriteBatch
- // to only SwitchMemtable or recovery.
+ // of the application, has only one sub-batch, i.e., no duplicate keys, and is
+ // meant to be used later during recovery. It enables an optimization to
+ // postpone updating the memtable with CommitTimeWriteBatch to only
+ // SwitchMemtable or recovery.
bool use_only_the_last_commit_time_batch_for_recovery = false;
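// A minimal usage sketch (CommitWithLatestStateBatch is a hypothetical
// helper; error handling elided; assumes txn_db is backed by a WritePrepared
// TransactionDB): the application stages its duplicate-key-free latest state
// in the commit-time batch, and with this flag set the memtable write of
// that batch is deferred to SwitchMemtable or recovery.
#include "rocksdb/utilities/transaction_db.h"

void CommitWithLatestStateBatch(rocksdb::TransactionDB* txn_db) {
  rocksdb::TransactionOptions txn_options;
  txn_options.use_only_the_last_commit_time_batch_for_recovery = true;
  rocksdb::Transaction* txn =
      txn_db->BeginTransaction(rocksdb::WriteOptions(), txn_options);
  txn->SetName("xid1");
  txn->Put("key", "value");
  // The commit-time batch must form a single sub-batch: no duplicate keys.
  txn->GetCommitTimeWriteBatch()->Put("latest_state", "v2");
  txn->Prepare();
  txn->Commit();
  delete txn;
}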
// TODO(agiardullo): TransactionDB does not yet support comparators that allow
auto s = batch->Iterate(&counter);
assert(s.ok());
batch_cnt = counter.BatchCount();
- // TODO(myabandeh): replace me with a stat
- ROCKS_LOG_WARN(info_log_, "Duplicate key overhead: %" PRIu64 " batches",
- static_cast<uint64_t>(batch_cnt));
+ WPRecordTick(TXN_DUPLICATE_KEY_OVERHEAD);
+ ROCKS_LOG_DETAILS(info_log_, "Duplicate key overhead: %" PRIu64 " batches",
+ static_cast<uint64_t>(batch_cnt));
}
assert(batch_cnt);
while (!prepared_txns_.empty() && prepared_txns_.top() <= new_max) {
auto to_be_popped = prepared_txns_.top();
delayed_prepared_.insert(to_be_popped);
- // TODO(myabandeh): also add a stat
ROCKS_LOG_WARN(info_log_,
"prepared_mutex_ overhead %" PRIu64 " (prep=%" PRIu64
" new_max=%" PRIu64 " oldmax=%" PRIu64,
// old_commit_map_. Check and do garbage collection if that is the case.
bool need_gc = false;
{
- // TODO(myabandeh): also add a stat
+ WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead");
ReadLock rl(&old_commit_map_mutex_);
auto prep_set_entry = old_commit_map_.find(snap_seq);
need_gc = prep_set_entry != old_commit_map_.end();
}
if (need_gc) {
- // TODO(myabandeh): also add a stat
+ WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead");
WriteLock wl(&old_commit_map_mutex_);
old_commit_map_.erase(snap_seq);
#ifndef NDEBUG
size_t sync_i = 0;
#endif
- // TODO(myabandeh): replace me with a stat
- ROCKS_LOG_WARN(info_log_, "snapshots_mutex_ overhead");
+ ROCKS_LOG_DETAILS(info_log_, "snapshots_mutex_ overhead");
WriteLock wl(&snapshots_mutex_);
snapshots_version_ = version;
// We update the list concurrently with the readers.
if (UNLIKELY(SNAPSHOT_CACHE_SIZE < cnt && ip1 == SNAPSHOT_CACHE_SIZE &&
snapshot_seq < evicted.prep_seq)) {
// Then access the less efficient list of snapshots_
- // TODO(myabandeh): also add a stat
+ WPRecordTick(TXN_SNAPSHOT_MUTEX_OVERHEAD);
ROCKS_LOG_WARN(info_log_, "snapshots_mutex_ overhead");
ReadLock rl(&snapshots_mutex_);
// Items could have moved from the snapshots_ to snapshot_cache_ before
}
// then snapshot_seq < commit_seq
if (prep_seq <= snapshot_seq) { // overlapping range
+ WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead");
- // TODO(myabandeh): also add a stat
WriteLock wl(&old_commit_map_mutex_);
old_commit_map_empty_.store(false, std::memory_order_release);
auto& vec = old_commit_map_[snapshot_seq];
}
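// The condition above, restated as a standalone predicate
// (CommitStraddlesSnapshot is a hypothetical helper): a transaction prepared
// at or before a snapshot but committed after it straddles that snapshot and
// must stay invisible to it, which is why such pairs are recorded in
// old_commit_map_ keyed by snapshot_seq.
#include <cstdint>

bool CommitStraddlesSnapshot(uint64_t prep_seq, uint64_t commit_seq,
                             uint64_t snapshot_seq) {
  return prep_seq <= snapshot_seq && snapshot_seq < commit_seq;
}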
if (!delayed_prepared_empty_.load(std::memory_order_acquire)) {
// We should not normally reach here
+ WPRecordTick(TXN_PREPARE_MUTEX_OVERHEAD);
ReadLock rl(&prepared_mutex_);
- // TODO(myabandeh): also add a stat
ROCKS_LOG_WARN(info_log_, "prepared_mutex_ overhead %" PRIu64,
static_cast<uint64_t>(delayed_prepared_.size()));
if (delayed_prepared_.find(prep_seq) != delayed_prepared_.end()) {
// We should not normally reach here unless snapshot_seq is old. This is a
// rare case and it is ok to pay the cost of mutex ReadLock for such old,
// reading transactions.
- // TODO(myabandeh): also add a stat
+ WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead");
ReadLock rl(&old_commit_map_mutex_);
auto prep_set_entry = old_commit_map_.find(snapshot_seq);
void Init(const TransactionDBOptions& /* unused */);
+ void WPRecordTick(uint32_t ticker_type) const {
+ RecordTick(db_impl_->immutable_db_options_.statistics.get(), ticker_type);
+ }
+
// A heap with the amortized O(1) complexity for erase. It uses one extra heap
// to keep track of erased entries that are not yet on top of the main heap.
class PreparedHeap {
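// An illustrative standalone sketch of the lazy-erase idea described above
// (LazyEraseMinHeap is hypothetical, not the PreparedHeap implementation
// itself; it assumes a value is erased at most once and only after it was
// pushed): erased entries are parked in a side heap and removed from the
// main heap only when they surface at its top, so erase never has to search
// the main heap.
#include <cstdint>
#include <functional>
#include <queue>
#include <vector>

class LazyEraseMinHeap {
 public:
  void push(uint64_t v) { main_.push(v); }
  void erase(uint64_t v) { erased_.push(v); }  // defer the actual removal
  bool empty() {
    drain();
    return main_.empty();
  }
  uint64_t top() {
    drain();
    return main_.top();
  }
  void pop() {
    drain();
    main_.pop();
  }

 private:
  // Discard matched pairs while the smallest erased entry is the main top.
  void drain() {
    while (!erased_.empty() && !main_.empty() &&
           erased_.top() == main_.top()) {
      erased_.pop();
      main_.pop();
    }
  }
  std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>>
      main_, erased_;
};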