* Add stats for ReadAsync time spent and async read errors.
* Basic support for the wide-column data model is now available. Wide-column entities can be stored using the `PutEntity` API, and retrieved using `GetEntity` and the new `columns` API of iterator. For compatibility, the classic APIs `Get` and `MultiGet`, as well as iterator's `value` API return the value of the anonymous default column of wide-column entities; also, `GetEntity` and iterator's `columns` return any plain key-values in the form of an entity which only has the anonymous default column. `Merge` (and `GetMergeOperands`) currently also apply to the default column; any other columns of entities are unaffected by `Merge` operations. Note that some features like compaction filters, transactions, user-defined timestamps, and the SST file writer do not yet support wide-column entities; also, there is currently no `MultiGet`-like API to retrieve multiple entities at once. We plan to gradually close the above gaps and also implement new features like column-level operations (e.g. updating or querying only certain columns of an entity).
* Marked HyperClockCache as a production-ready alternative to LRUCache for the block cache. HyperClockCache greatly improves hot-path CPU efficiency under high parallel load or high contention, with some documented caveats and limitations. As much as 4.5x higher ops/sec vs. LRUCache has been seen in db_bench under high parallel load.
+* Add periodic diagnostics to info_log (LOG file) for HyperClockCache block cache if performance is degraded by bad `estimated_entry_charge` option.
### Public API Changes
* Marked `block_cache_compressed` as a deprecated feature. Use SecondaryCache instead.
#include <cassert>
#include <functional>
+#include <numeric>
#include "cache/cache_key.h"
+#include "logging/logging.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/statistics.h"
#include "port/lang.h"
return table_.GetUsage();
}
+template <class Table>
+size_t ClockCacheShard<Table>::GetDetachedUsage() const {
+ return table_.GetDetachedUsage();
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetCapacity() const {
+ return capacity_;
+}
+
template <class Table>
size_t ClockCacheShard<Table>::GetPinnedUsage() const {
// Computes the pinned usage by scanning the whole hash table. This
return table_.GetOccupancy();
}
+template <class Table>
+size_t ClockCacheShard<Table>::GetOccupancyLimit() const {
+ return table_.GetOccupancyLimit();
+}
+
template <class Table>
size_t ClockCacheShard<Table>::GetTableAddressCount() const {
return table_.GetTableSize();
return h->deleter;
}
+namespace {
+
+// For each cache shard, estimate what the table load factor would be if
+// cache filled to capacity with average entries. This is considered
+// indicative of a potential problem if the shard is essentially operating
+// "at limit", which we define as high actual usage (>80% of capacity)
+// or actual occupancy very close to limit (>95% of limit).
+// Also, for each shard compute the recommended estimated_entry_charge,
+// and keep the minimum one for use as overall recommendation.
+void AddShardEvaluation(const HyperClockCache::Shard& shard,
+ std::vector<double>& predicted_load_factors,
+ size_t& min_recommendation) {
+ size_t usage = shard.GetUsage() - shard.GetDetachedUsage();
+ size_t capacity = shard.GetCapacity();
+ double usage_ratio = 1.0 * usage / capacity;
+
+ size_t occupancy = shard.GetOccupancyCount();
+ size_t occ_limit = shard.GetOccupancyLimit();
+ double occ_ratio = 1.0 * occupancy / occ_limit;
+ if (usage == 0 || occupancy == 0 || (usage_ratio < 0.8 && occ_ratio < 0.95)) {
+ // Skip as described above
+ return;
+ }
+
+ // If filled to capacity, what would the occupancy ratio be?
+ double ratio = occ_ratio / usage_ratio;
+ // Given max load factor, what that load factor be?
+ double lf = ratio * kStrictLoadFactor;
+ predicted_load_factors.push_back(lf);
+
+ // Update min_recommendation also
+ size_t recommendation = usage / occupancy;
+ min_recommendation = std::min(min_recommendation, recommendation);
+}
+
+} // namespace
+
+void HyperClockCache::ReportProblems(
+ const std::shared_ptr<Logger>& info_log) const {
+ uint32_t shard_count = GetNumShards();
+ std::vector<double> predicted_load_factors;
+ size_t min_recommendation = SIZE_MAX;
+ const_cast<HyperClockCache*>(this)->ForEachShard(
+ [&](HyperClockCache::Shard* shard) {
+ AddShardEvaluation(*shard, predicted_load_factors, min_recommendation);
+ });
+
+ if (predicted_load_factors.empty()) {
+ // None operating "at limit" -> nothing to report
+ return;
+ }
+ std::sort(predicted_load_factors.begin(), predicted_load_factors.end());
+
+ // First, if the average load factor is within spec, we aren't going to
+ // complain about a few shards being out of spec.
+ // NOTE: this is only the average among cache shards operating "at limit,"
+ // which should be representative of what we care about. It it normal, even
+ // desirable, for a cache to operate "at limit" so this should not create
+ // selection bias. See AddShardEvaluation().
+ // TODO: Consider detecting cases where decreasing the number of shards
+ // would be good, e.g. serious imbalance among shards.
+ double average_load_factor =
+ std::accumulate(predicted_load_factors.begin(),
+ predicted_load_factors.end(), 0.0) /
+ shard_count;
+
+ constexpr double kLowSpecLoadFactor = kLoadFactor / 2;
+ constexpr double kMidSpecLoadFactor = kLoadFactor / 1.414;
+ if (average_load_factor > kLoadFactor) {
+ // Out of spec => Consider reporting load factor too high
+ // Estimate effective overall capacity loss due to enforcing occupancy limit
+ double lost_portion = 0.0;
+ int over_count = 0;
+ for (double lf : predicted_load_factors) {
+ if (lf > kStrictLoadFactor) {
+ ++over_count;
+ lost_portion += (lf - kStrictLoadFactor) / lf / shard_count;
+ }
+ }
+ // >= 20% loss -> error
+ // >= 10% loss -> consistent warning
+ // >= 1% loss -> intermittent warning
+ InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
+ bool report = true;
+ if (lost_portion > 0.2) {
+ level = InfoLogLevel::ERROR_LEVEL;
+ } else if (lost_portion > 0.1) {
+ level = InfoLogLevel::WARN_LEVEL;
+ } else if (lost_portion > 0.01) {
+ int report_percent = static_cast<int>(lost_portion * 100.0);
+ if (Random::GetTLSInstance()->PercentTrue(report_percent)) {
+ level = InfoLogLevel::WARN_LEVEL;
+ }
+ } else {
+ // don't report
+ report = false;
+ }
+ if (report) {
+ ROCKS_LOG_AT_LEVEL(
+ info_log, level,
+ "HyperClockCache@%p unable to use estimated %.1f%% capacity because "
+ "of "
+ "full occupancy in %d/%u cache shards (estimated_entry_charge too "
+ "high). Recommend estimated_entry_charge=%zu",
+ this, lost_portion * 100.0, over_count, (unsigned)shard_count,
+ min_recommendation);
+ }
+ } else if (average_load_factor < kLowSpecLoadFactor) {
+ // Out of spec => Consider reporting load factor too low
+ // But cautiously because low is not as big of a problem.
+
+ // Only report if highest occupancy shard is also below
+ // spec and only if average is substantially out of spec
+ if (predicted_load_factors.back() < kLowSpecLoadFactor &&
+ average_load_factor < kLowSpecLoadFactor / 1.414) {
+ InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
+ if (average_load_factor < kLowSpecLoadFactor / 2) {
+ level = InfoLogLevel::WARN_LEVEL;
+ }
+ ROCKS_LOG_AT_LEVEL(
+ info_log, level,
+ "HyperClockCache@%p table has low occupancy at full capacity. Higher "
+ "estimated_entry_charge (about %.1fx) would likely improve "
+ "performance. Recommend estimated_entry_charge=%zu",
+ this, kMidSpecLoadFactor / average_load_factor, min_recommendation);
+ }
+ }
+}
+
} // namespace clock_cache
// DEPRECATED (see public API)
return occupancy_.load(std::memory_order_relaxed);
}
+ size_t GetOccupancyLimit() const { return occupancy_limit_; }
+
size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
size_t GetDetachedUsage() const {
void Erase(const Slice& key, const UniqueId64x2& hashed_key);
+ size_t GetCapacity() const;
+
size_t GetUsage() const;
+ size_t GetDetachedUsage() const;
+
size_t GetPinnedUsage() const;
size_t GetOccupancyCount() const;
+ size_t GetOccupancyLimit() const;
+
size_t GetTableAddressCount() const;
void ApplyToSomeEntries(
size_t GetCharge(Handle* handle) const override;
DeleterFn GetDeleter(Handle* handle) const override;
+
+ void ReportProblems(
+ const std::shared_ptr<Logger>& /*info_log*/) const override;
}; // class HyperClockCache
} // namespace clock_cache
capacity,
BlockBasedTableOptions().block_size /*estimated_value_size*/)
.MakeSharedCache()}) {
- if (!cache) {
- // Skip clock cache when not supported
- continue;
- }
-
++iterations_tested;
Options options = CurrentOptions();
}
}
+namespace {
+
+void DummyFillCache(Cache& cache, size_t entry_size,
+ std::vector<CacheHandleGuard<void>>& handles) {
+ // fprintf(stderr, "Entry size: %zu\n", entry_size);
+ handles.clear();
+ cache.EraseUnRefEntries();
+ void* fake_value = &cache;
+ size_t capacity = cache.GetCapacity();
+ OffsetableCacheKey ck{"abc", "abc", 42};
+ for (size_t my_usage = 0; my_usage < capacity;) {
+ size_t charge = std::min(entry_size, capacity - my_usage);
+ Cache::Handle* handle;
+ Status st = cache.Insert(ck.WithOffset(my_usage).AsSlice(), fake_value,
+ charge, /*deleter*/ nullptr, &handle);
+ ASSERT_OK(st);
+ handles.emplace_back(&cache, handle);
+ my_usage += charge;
+ }
+}
+
+class CountingLogger : public Logger {
+ public:
+ ~CountingLogger() override {}
+ using Logger::Logv;
+ void Logv(const InfoLogLevel log_level, const char* format,
+ va_list /*ap*/) override {
+ if (std::strstr(format, "HyperClockCache") == nullptr) {
+ // Not a match
+ return;
+ }
+ // static StderrLogger debug;
+ // debug.Logv(log_level, format, ap);
+ if (log_level == InfoLogLevel::INFO_LEVEL) {
+ ++info_count_;
+ } else if (log_level == InfoLogLevel::WARN_LEVEL) {
+ ++warn_count_;
+ } else if (log_level == InfoLogLevel::ERROR_LEVEL) {
+ ++error_count_;
+ }
+ }
+
+ std::array<int, 3> PopCounts() {
+ std::array<int, 3> rv{{info_count_, warn_count_, error_count_}};
+ info_count_ = warn_count_ = error_count_ = 0;
+ return rv;
+ }
+
+ private:
+ int info_count_{};
+ int warn_count_{};
+ int error_count_{};
+};
+
+} // namespace
+
+TEST_F(DBBlockCacheTest, HyperClockCacheReportProblems) {
+ size_t capacity = 1024 * 1024;
+ size_t value_size_est = 8 * 1024;
+ HyperClockCacheOptions hcc_opts{capacity, value_size_est};
+ hcc_opts.num_shard_bits = 2; // 4 shards
+ hcc_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<Cache> cache = hcc_opts.MakeSharedCache();
+ std::shared_ptr<CountingLogger> logger = std::make_shared<CountingLogger>();
+
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ table_options.block_cache = cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.info_log = logger;
+ // Going to sample more directly
+ options.stats_dump_period_sec = 0;
+ Reopen(options);
+
+ std::vector<CacheHandleGuard<void>> handles;
+
+ // Clear anything from DB startup
+ logger->PopCounts();
+
+ // Fill cache based on expected size and check that when we
+ // don't report anything relevant in periodic stats dump
+ DummyFillCache(*cache, value_size_est, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+ // Same, within reasonable bounds
+ DummyFillCache(*cache, value_size_est - value_size_est / 4, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+ DummyFillCache(*cache, value_size_est + value_size_est / 3, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+ // Estimate too high (value size too low) eventually reports ERROR
+ DummyFillCache(*cache, value_size_est / 2, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+
+ DummyFillCache(*cache, value_size_est / 3, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 1}}));
+
+ // Estimate too low (value size too high) starts with INFO
+ // and is only WARNING in the worst case
+ DummyFillCache(*cache, value_size_est * 2, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{1, 0, 0}}));
+
+ DummyFillCache(*cache, value_size_est * 3, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+
+ DummyFillCache(*cache, value_size_est * 20, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+}
+
#endif // ROCKSDB_LITE
class DBBlockCacheKeyTest
return;
}
+ // Also probe block cache(s) for problems, dump to info log
+ UnorderedSet<Cache*> probed_caches;
TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning");
{
InstrumentedMutexLock l(&mutex_);
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
- if (cfd->initialized()) {
- // Release DB mutex for gathering cache entry stats. Pass over all
- // column families for this first so that other stats are dumped
- // near-atomically.
- InstrumentedMutexUnlock u(&mutex_);
- cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
+ if (!cfd->initialized()) {
+ continue;
+ }
+
+ // Release DB mutex for gathering cache entry stats. Pass over all
+ // column families for this first so that other stats are dumped
+ // near-atomically.
+ InstrumentedMutexUnlock u(&mutex_);
+ cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
+
+ // Probe block cache for problems (if not already via another CF)
+ if (immutable_db_options_.info_log) {
+ auto* table_factory = cfd->ioptions()->table_factory.get();
+ assert(table_factory != nullptr);
+ Cache* cache =
+ table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
+ if (cache && probed_caches.insert(cache).second) {
+ cache->ReportProblems(immutable_db_options_.info_log);
+ }
}
}
class Cache;
struct ConfigOptions;
+class Logger;
class SecondaryCache;
// Classifications of block cache entries.
virtual std::string GetPrintableOptions() const { return ""; }
+ // Check for any warnings or errors in the operation of the cache and
+ // report them to the logger. This is intended only to be called
+ // periodically so does not need to be very efficient. (Obscure calling
+ // conventions for Logger inherited from env.h)
+ virtual void ReportProblems(
+ const std::shared_ptr<Logger>& /*info_log*/) const {}
+
MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); }
// EXPERIMENTAL
#define ROCKS_LOG_HEADER(LGR, FMT, ...) \
ROCKSDB_NAMESPACE::Log(InfoLogLevel::HEADER_LEVEL, LGR, FMT, ##__VA_ARGS__)
-#define ROCKS_LOG_DEBUG(LGR, FMT, ...) \
- ROCKSDB_NAMESPACE::Log(InfoLogLevel::DEBUG_LEVEL, LGR, \
- ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+#define ROCKS_LOG_AT_LEVEL(LGR, LVL, FMT, ...) \
+ ROCKSDB_NAMESPACE::Log((LVL), (LGR), ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
-#define ROCKS_LOG_INFO(LGR, FMT, ...) \
- ROCKSDB_NAMESPACE::Log(InfoLogLevel::INFO_LEVEL, LGR, \
- ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
- RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+#define ROCKS_LOG_DEBUG(LGR, FMT, ...) \
+ ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::DEBUG_LEVEL, FMT, ##__VA_ARGS__)
-#define ROCKS_LOG_WARN(LGR, FMT, ...) \
- ROCKSDB_NAMESPACE::Log(InfoLogLevel::WARN_LEVEL, LGR, \
- ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
- RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+#define ROCKS_LOG_INFO(LGR, FMT, ...) \
+ ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::INFO_LEVEL, FMT, ##__VA_ARGS__)
-#define ROCKS_LOG_ERROR(LGR, FMT, ...) \
- ROCKSDB_NAMESPACE::Log(InfoLogLevel::ERROR_LEVEL, LGR, \
- ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
- RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+#define ROCKS_LOG_WARN(LGR, FMT, ...) \
+ ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::WARN_LEVEL, FMT, ##__VA_ARGS__)
-#define ROCKS_LOG_FATAL(LGR, FMT, ...) \
- ROCKSDB_NAMESPACE::Log(InfoLogLevel::FATAL_LEVEL, LGR, \
- ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
- RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+#define ROCKS_LOG_ERROR(LGR, FMT, ...) \
+ ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::ERROR_LEVEL, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_FATAL(LGR, FMT, ...) \
+ ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::FATAL_LEVEL, FMT, ##__VA_ARGS__)
#define ROCKS_LOG_BUFFER(LOG_BUF, FMT, ...) \
ROCKSDB_NAMESPACE::LogToBuffer(LOG_BUF, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \