res.emplace_back(base, 1); // size of the last db_path has no effect
}
+// Computes how much of the DB ("fast") volume is currently safe to lend
+// to "slow"-level data. Result is the configured static allowance
+// (db_avail4slow, see get_max_extra()) capped by the DB volume space not
+// claimed by historically observed LOG/WAL/DB usage — including observed
+// spillover of DB-level data onto the slow device, since that data could
+// migrate back to the DB volume.
+uint64_t RocksDBBlueFSVolumeSelector::get_effective_extra() const {
+ // considering statically available db space vs.
+ // - observed maximums on DB dev for DB/WAL/UNSORTED data
+ // - observed maximum spillovers
+
+ // max db usage we potentially observed
+ uint64_t max_db_use = 0;
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
+ // this could go to db hence using it in the estimation
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
+
+ auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
+ // never exceed the static allowance; clamp to 0 if historical maxima
+ // already cover the whole DB volume
+ uint64_t avail = std::min(
+   db_avail4slow,
+   max_db_use < db_total ? db_total - max_db_use : 0);
+
+ return avail;
+}
+
uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
ceph_assert(h != nullptr);
uint64_t hint = reinterpret_cast<uint64_t>(h);
case LEVEL_SLOW:
res = BlueFS::BDEV_SLOW;
if (db_avail4slow > 0) {
- // considering statically available db space vs.
- // - observed maximums on DB dev for DB/WAL/UNSORTED data
- // - observed maximum spillovers
- uint64_t max_db_use = 0; // max db usage we potentially observed
- max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
- max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
- max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
- // this could go to db hence using it in the estimation
- max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
-
- auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
- uint64_t avail = std::min(
- db_avail4slow,
- max_db_use < db_total ? db_total - max_db_use : 0);
+ auto avail = get_effective_extra();
// considering current DB dev usage for SLOW data
if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
res = BlueFS::BDEV_DB;
auto max_y = per_level_per_dev_usage.get_max_y();
sout << "RocksDBBlueFSVolumeSelector " << std::endl;
- sout << ">>Settings<<"
- << " extra=" << byte_u_t(db_avail4slow)
+ sout << ">>Parameters<<"
+ << " effective extra=" << byte_u_t(get_effective_extra())
+ << " max extra=" << byte_u_t(db_avail4slow)
<< ", extra level=" << extra_level
<< ", l0_size=" << byte_u_t(level0_size)
<< ", l_base=" << byte_u_t(level_base)
}
}
- uint64_t get_available_extra() const {
- return db_avail4slow;
- }
+ // Returns a static value (based on disk layout and store's settings)
+ // of the first RocksDB level which doesn't fully fit into "fast" volume.
+ // So with the selector's help data belonging to this level could be
+ // [partially] allocated at that volume if get_*_extra() methods below
+ // indicate non-zero values.
uint64_t get_extra_level() const {
return extra_level;
}
+ // Returns a static value (based on disk layout and store's settings)
+ // of the extra space at DB volume which
+ // this volume selector permits for using by data from "slow" levels.
+ // Derived from disk layout and store's settings only; see
+ // get_effective_extra() for the value that additionally takes
+ // maximum historical observations into account
+ uint64_t get_max_extra() const {
+ return db_avail4slow;
+ }
+ // Calculates the effective amount of extra space at "fast" volume which
+ // this volume selector permits for using by data from "slow" levels.
+ // Takes both the maximum historical observations and the value from
+ // get_max_extra() into account to get the final result.
+ uint64_t get_effective_extra() const;
void* get_hint_for_log() const override {
return reinterpret_cast<void*>(LEVEL_LOG);
}
bluefs_extent_t e;
ASSERT_EQ(4, selector.get_extra_level());
- ASSERT_EQ(30ull << 30, selector.get_available_extra()); // 168GB - 1GB (L0) - 1GB (L1) - 8GB (L2) - 2*64GB (L3)
+ ASSERT_EQ(30ull << 30, selector.get_max_extra()); // 168GB - 1GB (L0) - 1GB (L1) - 8GB (L2) - 2*64GB (L3)
+ ASSERT_EQ(30ull << 30, selector.get_effective_extra()); // no history so we get max extra
ASSERT_EQ(0, selector.select_prefer_bdev((void*)log_bdev));
ASSERT_EQ(0, selector.select_prefer_bdev((void*)wal_bdev));
e.length = 1ull * (1 << 30);
selector.add_usage((void*)db_bdev, e);
}
+
+ ASSERT_EQ(30ull << 30, selector.get_effective_extra()); // still 30GB is available
+
ASSERT_EQ(0, selector.select_prefer_bdev((void*)log_bdev));
ASSERT_EQ(0, selector.select_prefer_bdev((void*)wal_bdev));
ASSERT_EQ(1, selector.select_prefer_bdev((void*)db_bdev));
ASSERT_EQ(1, selector.select_prefer_bdev((void*)slow_bdev));
- // 'Use' 30GB Slow level data at DB vol
- for (size_t i = 0; i < 30; i++) {
+ // 'Use' 20GB Slow level data at DB vol
+ for (size_t i = 0; i < 20; i++) {
e.bdev = 1; // DB dev
e.length = 1ull * (1 << 30);
ASSERT_EQ(1, selector.select_prefer_bdev((void*)slow_bdev));
ASSERT_EQ(0, selector.select_prefer_bdev((void*)log_bdev));
ASSERT_EQ(0, selector.select_prefer_bdev((void*)wal_bdev));
ASSERT_EQ(1, selector.select_prefer_bdev((void*)db_bdev));
+ ASSERT_EQ(1, selector.select_prefer_bdev((void*)slow_bdev));
+
+ // 'Use' 10GB more Slow level data at DB vol
+ for (size_t i = 0; i < 10; i++) {
+ e.bdev = 1; // DB dev
+ e.length = 1ull * (1 << 30);
+ ASSERT_EQ(1, selector.select_prefer_bdev((void*)slow_bdev));
+ selector.add_usage((void*)slow_bdev, e);
+ }
+ // now slow data to be targeted to slow dev
+ ASSERT_EQ(0, selector.select_prefer_bdev((void*)log_bdev));
+ ASSERT_EQ(0, selector.select_prefer_bdev((void*)wal_bdev));
+ ASSERT_EQ(1, selector.select_prefer_bdev((void*)db_bdev));
ASSERT_EQ(2, selector.select_prefer_bdev((void*)slow_bdev));
// 'Unuse' 10GB DB level data at DB vol, slow data still wouldn't fit
ASSERT_EQ(1, selector.select_prefer_bdev((void*)slow_bdev));
selector.add_usage((void*)db_bdev, e);
}
+ ASSERT_EQ(10ull << 30, selector.get_effective_extra()); // reduced extra due to historic usage
+
+ // 'Use' 10GB more slow level data at DB vol
for (size_t i = 0; i < 10; i++) {
e.bdev = 1; // DB dev
e.length = 1ull * (1 << 30);
selector.add_usage((void*)slow_bdev, e);
}
ASSERT_EQ(2, selector.select_prefer_bdev((void*)slow_bdev));
+ ASSERT_EQ(10ull << 30, selector.get_effective_extra()); // reduced extra due to historic usage
// 'Unuse' remaining 10GB Slow level data at DB vol
for (size_t i = 0; i < 10; i++) {
selector.add_usage((void*)db_bdev, e);
}
ASSERT_EQ(2, selector.select_prefer_bdev((void*)slow_bdev));
+ ASSERT_EQ(0, selector.get_effective_extra()); // historic usage results in no extra
// 'Unuse' 50GB DB level data, thi s wouldn't let slow data use DB volume anyway
// due to updated historic maximum
selector.sub_usage((void*)db_bdev, e);
}
ASSERT_EQ(2, selector.select_prefer_bdev((void*)slow_bdev));
+ ASSERT_EQ(0, selector.get_effective_extra()); // historic usage results in no extra
{
std::stringstream ss;
ASSERT_EQ(0, selector.get_max_db_total());
selector.dump(ss);
std::cout << ss.str() << std::endl;
+ ASSERT_EQ(30ull << 30, selector.get_effective_extra()); // no history any more
}
+
}
int main(int argc, char **argv) {