}
if (fnode.ino != 1) {
vselector->sub_usage(f->vselector_hint, f->fnode);
- }
+ vselector->add_usage(f->vselector_hint, fnode);
+ }
f->fnode = fnode;
- if (fnode.ino != 1) {
- vselector->add_usage(f->vselector_hint, f->fnode);
- }
if (fnode.ino > ino_last) {
ino_last = fnode.ino;
// write entire file
auto l = _allocate(dev_target, bl.length(), 0,
- &file_ref->fnode, 0, false);
+ &file_ref->fnode, nullptr, 0, false);
if (l < 0) {
derr << __func__ << " unable to allocate len 0x" << std::hex
<< bl.length() << std::dec << " from " << (int)dev_target
// write entire file
auto l = _allocate(dev_target, bl.length(), 0,
- &file_ref->fnode, 0, false);
+ &file_ref->fnode, nullptr, 0, false);
if (l < 0) {
derr << __func__ << " unable to allocate len 0x" << std::hex
<< bl.length() << std::dec << " from " << (int)dev_target
auto t0 = mono_clock::now();
File *log_file = log.writer->file.get();
- bluefs_fnode_t fnode_tail;
// log.t.seq is always set to current live seq
ceph_assert(log.t.seq == log.seq_live);
// Capturing entire state. Dump anything that has been stored there.
dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need << dendl;
- int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, 0,
+ bluefs_fnode_t fnode_tail;
+ int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, nullptr, 0,
permit_dev_fallback);
ceph_assert(r == 0);
uint64_t starter_need = _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
bluefs_fnode_t fnode_starter(log_file->fnode.ino, 0, mtime);
- r = _allocate(log_dev, starter_need, 0, &fnode_starter, 0,
+ r = _allocate(log_dev, starter_need, 0, &fnode_starter, nullptr, 0,
permit_dev_fallback);
ceph_assert(r == 0);
}
ll.release();
uint64_t allocated_before_extension = log.writer->file->fnode.get_allocated();
- vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
amount = round_up_to(amount, super.block_size);
int r = _allocate(
vselector->select_prefer_bdev(log.writer->file->vselector_hint),
amount,
0,
- &log.writer->file->fnode);
+ &log.writer->file->fnode,
+ [&](const bluefs_extent_t& e) {
+ vselector->add_usage(log.writer->file->vselector_hint, e);
+ });
ceph_assert(r == 0);
dout(10) << "extended log by 0x" << std::hex << amount << " bytes " << dendl;
- vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
bluefs_transaction_t log_extend_transaction;
log_extend_transaction.seq = log.t.seq;
dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
<< " 0x" << offset << "~" << length << std::dec
- << " to " << h->file->fnode << dendl;
+ << " to " << h->file->fnode
+ << " hint " << h->file->vselector_hint
+ << dendl;
if (h->file->deleted) {
dout(10) << __func__ << " deleted, no-op" << dendl;
return 0;
ceph_assert(offset <= h->file->fnode.size);
uint64_t allocated = h->file->fnode.get_allocated();
- vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
// do not bother to dirty the file if we are overwriting
// previously allocated extents.
if (allocated < offset + length) {
int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
offset + length - allocated,
0,
- &h->file->fnode);
+ &h->file->fnode,
+ [&](const bluefs_extent_t& e) {
+ vselector->add_usage(h->file->vselector_hint, e);
+ });
if (r < 0) {
derr << __func__ << " allocated: 0x" << std::hex << allocated
<< " offset: 0x" << offset << " length: 0x" << length << std::dec
<< dendl;
- vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
ceph_abort_msg("bluefs enospc");
return r;
}
h->file->is_dirty = true;
}
if (h->file->fnode.size < offset + length) {
+ vselector->add_usage(h->file->vselector_hint, offset + length - h->file->fnode.size);
h->file->fnode.size = offset + length;
h->file->is_dirty = true;
}
-
dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
int res = _flush_data(h, offset, length, buffered);
- vselector->add_usage(h->file->vselector_hint, h->file->fnode);
return res;
}
_flush_bdev(h);
std::lock_guard ll(log.lock);
- vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
+ vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size - offset);
h->file->fnode.size = offset;
h->file->is_dirty = true;
- vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
log.t.op_file_update_inc(h->file->fnode);
return 0;
}
// Allocates space and appends the resulting extents to *node.
// NOTE(review): parts of this body are elided in this excerpt; only the
// visible lines are documented here.
//  - cb: optional per-extent callback; when non-empty it is invoked once for
//    every extent appended to *node in the final loop below.
//  - Returns 0 after the extents have been appended.
int BlueFS::_allocate(uint8_t id, uint64_t len,
uint64_t alloc_unit,
bluefs_fnode_t* node,
+ update_fn_t cb,
size_t alloc_attempts,
bool permit_dev_fallback)
{
len,
alloc_unit,
node,
// cb is handed through unchanged to the nested call (its head is not visible here).
+ cb,
alloc_attempts,
permit_dev_fallback);
// Taken only when fallback is permitted, this is not the slow device, and an
// allocator exists for the next device id.
} else if (permit_dev_fallback && id != BDEV_SLOW && alloc[id + 1]) {
len,
0, // back to default alloc unit
node,
+ cb,
alloc_attempts,
permit_dev_fallback);
} else {
}
// Record every allocated extent in the fnode and report it to the caller.
for (auto& p : extents) {
- node->append_extent(bluefs_extent_t(id, p.offset, p.length));
+ bluefs_extent_t e(id, p.offset, p.length);
+ node->append_extent(e);
// An empty std::function (e.g. the nullptr default) means "no notification".
+ if (cb) {
+ cb(e);
+ }
}
-
return 0;
}
if (off + len > allocated) {
uint64_t want = off + len - allocated;
- vselector->sub_usage(f->vselector_hint, f->fnode);
int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
want,
0,
- &f->fnode);
- vselector->add_usage(f->vselector_hint, f->fnode);
+ &f->fnode,
+ [&](const bluefs_extent_t& e) {
+ vselector->add_usage(f->vselector_hint, e);
+ });
if (r < 0)
return r;
_maybe_check_vselector_LNF();
FileRef file;
bool create = false;
- bool truncate = false;
mempool::bluefs::vector<bluefs_extent_t> pending_release_extents;
{
std::lock_guard ll(log.lock);
}
file = ceph::make_ref<File>();
file->fnode.ino = ++ino_last;
+ file->vselector_hint = vselector->get_hint_by_dir(dirname);
nodes.file_map[ino_last] = file;
dir->file_map[string{filename}] = file;
++file->refs;
create = true;
+ vselector->add_usage(file->vselector_hint, file->fnode.size, true); // update file count
logger->set(l_bluefs_num_files, nodes.file_map.size());
} else {
// overwrite existing file?
<< " already exists, truncate + overwrite" << dendl;
vselector->sub_usage(file->vselector_hint, file->fnode);
file->fnode.size = 0;
+ vselector->add_usage(file->vselector_hint, file->fnode.size, true); // restore file count
pending_release_extents.swap(file->fnode.extents);
- truncate = true;
file->fnode.clear_extents();
}
ceph_assert(file->fnode.ino > 1);
file->fnode.mtime = ceph_clock_now();
- file->vselector_hint = vselector->get_hint_by_dir(dirname);
- if (create || truncate) {
- vselector->add_usage(file->vselector_hint, file->fnode); // update file count
- }
-
dout(20) << __func__ << " mapping " << dirname << "/" << filename
<< " vsel_hint " << file->vselector_hint
<< dendl;
virtual void* get_hint_for_log() const = 0;
virtual void* get_hint_by_dir(std::string_view dirname) const = 0;
- virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
- virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
- virtual void add_usage(void* file_hint, uint64_t fsize) = 0;
- virtual void sub_usage(void* file_hint, uint64_t fsize) = 0;
// Convenience wrapper: accounts a whole fnode against `hint` by adding each of
// its extents through the per-extent overload, then adding fnode.size with
// upd_files = true.
+ void add_usage(void* hint, const bluefs_fnode_t& fnode) {
+ for (auto& e : fnode.extents) {
+ add_usage(hint, e);
+ }
+ add_usage(hint, fnode.size, true);
+ }
// Convenience wrapper: removes a whole fnode from the accounting for `hint` by
// subtracting each of its extents through the per-extent overload, then
// subtracting fnode.size with upd_files = true.
+ void sub_usage(void* hint, const bluefs_fnode_t& fnode) {
+ for (auto& e : fnode.extents) {
+ sub_usage(hint, e);
+ }
+ sub_usage(hint, fnode.size, true);
+ }
// Per-extent accounting hooks: add/remove a single extent for `hint`.
+ virtual void add_usage(void* hint, const bluefs_extent_t& extent) = 0;
+ virtual void sub_usage(void* hint, const bluefs_extent_t& extent) = 0;
// Size accounting hooks: add/remove `fsize` for `hint`; upd_files is passed as
// true by the fnode wrappers and defaults to false otherwise.
// NOTE(review): default arguments on virtual functions are bound by the static
// type of the call; the overrides visible in this file declare no default, so
// calls made through a derived type must pass upd_files explicitly.
+ virtual void add_usage(void* hint, uint64_t fsize, bool upd_files = false) = 0;
+ virtual void sub_usage(void* hint, uint64_t fsize, bool upd_files = false) = 0;
virtual uint8_t select_prefer_bdev(void* hint) = 0;
virtual void get_paths(const std::string& base, paths& res) const = 0;
virtual void dump(std::ostream& sout) = 0;
return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB;
}
const char* get_device_name(unsigned id);
+
+ // Per-extent callback type accepted by _allocate(). The extent is passed by
+ // const reference (matching the lambdas callers supply) so that invoking the
+ // callback does not copy the extent.
+ typedef std::function<void(const bluefs_extent_t&)> update_fn_t;
// Allocates space and appends the resulting extents to *node.
// cb (optional, empty by default) is invoked once for each extent appended.
int _allocate(uint8_t bdev, uint64_t len,
uint64_t alloc_unit,
bluefs_fnode_t* node,
+ update_fn_t cb = nullptr,
size_t alloc_attempts = 0,
bool permit_dev_fallback = true);
void* get_hint_for_log() const override;
void* get_hint_by_dir(std::string_view dirname) const override;
// Per-extent add hook: intentionally a no-op in this selector.
- void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
+ void add_usage(void* hint, const bluefs_extent_t& extent) override {
// do nothing
return;
}
// Per-extent subtract hook: intentionally a no-op in this selector.
- void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
+ void sub_usage(void* hint, const bluefs_extent_t& extent) override {
// do nothing
return;
}
// Size add hook: intentionally a no-op in this selector; all arguments are ignored.
- void add_usage(void* hint, uint64_t fsize) override {
+ void add_usage(void*, uint64_t, bool) override {
// do nothing
return;
}
// Size subtract hook: intentionally a no-op in this selector; all arguments are ignored.
- void sub_usage(void* hint, uint64_t fsize) override {
+ void sub_usage(void*, uint64_t, bool) override {
// do nothing
return;
}
bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
- 1024 * 1024 * 1024, //FIXME: set expected l0 size here
+ rocks_opts.write_buffer_size * rocks_opts.max_write_buffer_number,
rocks_opts.max_bytes_for_level_base,
rocks_opts.max_bytes_for_level_multiplier,
reserved_factor,
auto max_x = per_level_per_dev_usage.get_max_x();
auto max_y = per_level_per_dev_usage.get_max_y();
- sout << "RocksDBBlueFSVolumeSelector Usage Matrix:" << std::endl;
+ sout << "RocksDBBlueFSVolumeSelector " << std::endl;
+ sout << ">>Settings<<"
+ << " extra=" << byte_u_t(db_avail4slow)
+ << ", l0_size=" << byte_u_t(level0_size)
+ << ", l_base=" << byte_u_t(level_base)
+ << ", l_multi=" << byte_u_t(level_multiplier)
+ << std::endl;
constexpr std::array<const char*, 8> names{ {
"DEV/LEV",
"WAL",
uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST];
uint64_t db_avail4slow = 0;
+ uint64_t level0_size = 0;
+ uint64_t level_base = 0;
+ uint64_t level_multiplier = 0;
enum {
OLD_POLICY,
USE_SOME_EXTRA
if (!new_pol) {
return;
}
-
// Calculating how much extra space is available at DB volume.
// Depending on the presence of explicit reserved size specification it might be either
// * DB volume size - reserved
// or
// * DB volume size - sum_max_level_size(0, L) * reserved_factor
if (!reserved) {
+ level0_size = _level0_size;
+ level_base = _level_base;
+ level_multiplier = _level_multiplier;
uint64_t prev_levels = _level0_size;
uint64_t cur_level = _level_base;
- uint64_t cur_threshold = 0;
+ uint64_t cur_threshold = prev_levels + cur_level;
do {
- uint64_t next_level = cur_level * _level_multiplier;
- uint64_t next_threshold = prev_levels + cur_level + next_level * reserved_factor;
+ uint64_t next_level = cur_level * _level_multiplier;
+ uint64_t next_threshold = prev_levels + cur_level + next_level;
if (_db_total <= next_threshold) {
- db_avail4slow = cur_threshold ? _db_total - cur_threshold : 0;
+ cur_threshold *= reserved_factor;
+ db_avail4slow = cur_threshold < _db_total ? _db_total - cur_threshold : 0;
break;
} else {
prev_levels += cur_level;
}
} while (true);
} else {
- db_avail4slow = _db_total - reserved;
+ db_avail4slow = reserved < _db_total ? _db_total - reserved : 0;
}
}
}
void* get_hint_by_dir(std::string_view dirname) const override;
// Adds one extent to the usage matrix for the level identified by `hint`.
// A null hint is ignored. Updates both the (device, level) cell and the
// per-device totals column, and raises the matching "max" cells when the new
// value exceeds them.
- void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
+ void add_usage(void* hint, const bluefs_extent_t& extent) override {
if (hint == nullptr)
return;
// The hint encodes a level value; convert it to a zero-based column index.
size_t pos = (size_t)hint - LEVEL_FIRST;
- for (auto& p : fnode.extents) {
- auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
- auto& max = per_level_per_dev_max.at(p.bdev, pos);
- uint64_t v = cur.fetch_add(p.length) + p.length;
- while (v > max) {
- max.exchange(v);
- }
- {
- //update per-device totals
- auto& cur = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
- auto& max = per_level_per_dev_max.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
- uint64_t v = cur.fetch_add(p.length) + p.length;
- while (v > max) {
- max.exchange(v);
- }
- }
+ auto& cur = per_level_per_dev_usage.at(extent.bdev, pos);
+ auto& max = per_level_per_dev_max.at(extent.bdev, pos);
// v is the post-increment usage (fetch_add returns the previous value).
+ uint64_t v = cur.fetch_add(extent.length) + extent.length;
// NOTE(review): compare-then-exchange() is not an atomic "max". If these cells
// are std::atomic and this can run concurrently, a larger value stored by
// another thread between the comparison and the exchange can be overwritten.
// Confirm callers serialize, or switch to a compare_exchange loop.
+ while (v > max) {
+ max.exchange(v);
}
{
- //update per-level actual totals
- auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
- auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
- uint64_t v = cur.fetch_add(fnode.size) + fnode.size;
+ //update per-device totals
+ auto& cur = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST);
+ auto& max = per_level_per_dev_max.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST);
+ uint64_t v = cur.fetch_add(extent.length) + extent.length;
while (v > max) {
max.exchange(v);
}
}
- ++per_level_files[pos];
- ++per_level_files[LEVEL_MAX - LEVEL_FIRST];
}
// Removes one extent from the usage matrix for the level identified by `hint`.
// A null hint is ignored. Asserts that each affected counter is at least
// extent.length before subtracting; the "max" cells are not touched.
- void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
+ void sub_usage(void* hint, const bluefs_extent_t& extent) override {
if (hint == nullptr)
return;
// The hint encodes a level value; convert it to a zero-based column index.
size_t pos = (size_t)hint - LEVEL_FIRST;
- for (auto& p : fnode.extents) {
- auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
- ceph_assert(cur >= p.length);
- cur -= p.length;
-
- //update per-device totals
- auto& cur2 = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
- ceph_assert(cur2 >= p.length);
- cur2 -= p.length;
- }
- //update per-level actual totals
- auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
- ceph_assert(cur >= fnode.size);
- cur -= fnode.size;
- ceph_assert(per_level_files[pos] > 0);
- --per_level_files[pos];
- ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0);
- --per_level_files[LEVEL_MAX - LEVEL_FIRST];
// (device, level) cell
+ auto& cur = per_level_per_dev_usage.at(extent.bdev, pos);
+ ceph_assert(cur >= extent.length);
+ cur -= extent.length;
+
+ //update per-device totals
+ auto& cur2 = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST);
+ ceph_assert(cur2 >= extent.length);
+ cur2 -= extent.length;
}
// Size-based add hook. A null hint is ignored. When upd_files is true the
// per-level and overall file counters are incremented as well.
// NOTE(review): the lines that define `v` and `max` are elided in this excerpt.
- void add_usage(void* hint, uint64_t size_more) override {
+ void add_usage(void* hint, uint64_t size_more, bool upd_files) override {
if (hint == nullptr)
return;
// The hint encodes a level value; convert it to a zero-based column index.
size_t pos = (size_t)hint - LEVEL_FIRST;
while (v > max) {
max.exchange(v);
}
// File counts change only when the caller asks for it.
+ if (upd_files) {
+ ++per_level_files[pos];
+ ++per_level_files[LEVEL_MAX - LEVEL_FIRST];
+ }
}
// Size-based subtract hook. A null hint is ignored. Subtracts size_less from
// the (BlueFS::MAX_BDEV, level) cell after asserting it is large enough; when
// upd_files is true the per-level and overall file counters are decremented
// too, each guarded by an assert that it is still positive.
- void sub_usage(void* hint, uint64_t size_less) override {
+ void sub_usage(void* hint, uint64_t size_less, bool upd_files) override {
if (hint == nullptr)
return;
// The hint encodes a level value; convert it to a zero-based column index.
size_t pos = (size_t)hint - LEVEL_FIRST;
auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
ceph_assert(cur >= size_less);
cur -= size_less;
// File counts change only when the caller asks for it.
+ if (upd_files) {
+ ceph_assert(per_level_files[pos] > 0);
+ --per_level_files[pos];
+ ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0);
+ --per_level_files[LEVEL_MAX - LEVEL_FIRST];
+ }
}
uint8_t select_prefer_bdev(void* h) override;