f->dump_string("device", bluefs->get_device_name(dev));
ceph_assert(bluefs->alloc[dev]);
auto total = bluefs->get_total(dev);
- auto free = bluefs->alloc[dev]->get_free();
- auto used = bluefs->alloc[dev] == bluefs->shared_bdev_alloc ?
- bluefs->shared_bdev_used.load() :
- total - free;
+ auto free = bluefs->get_free(dev);
+ auto used = bluefs->get_used(dev);
f->dump_int("total", total);
f->dump_int("free", free);
}
};
-BlueFS::BlueFS(CephContext* cct)
+BlueFS::BlueFS(CephContext* cct,
+ bluefs_shared_alloc_context_t* _shared_alloc)
: cct(cct),
bdev(MAX_BDEV),
ioc(MAX_BDEV),
- block_all(MAX_BDEV),
+ block_reserved(MAX_BDEV),
alloc(MAX_BDEV),
alloc_size(MAX_BDEV, 0),
- pending_release(MAX_BDEV)
+ pending_release(MAX_BDEV),
+ shared_alloc(_shared_alloc)
{
discard_cb[BDEV_WAL] = wal_discard_cb;
discard_cb[BDEV_DB] = db_discard_cb;
logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
if (alloc[BDEV_WAL]) {
- logger->set(l_bluefs_wal_total_bytes, block_all[BDEV_WAL].size());
- logger->set(l_bluefs_wal_used_bytes,
- block_all[BDEV_WAL].size() - alloc[BDEV_WAL]->get_free());
+ logger->set(l_bluefs_wal_total_bytes, _get_total(BDEV_WAL));
+ logger->set(l_bluefs_wal_used_bytes, _get_used(BDEV_WAL));
}
if (alloc[BDEV_DB]) {
- logger->set(l_bluefs_db_total_bytes, block_all[BDEV_DB].size());
- uint64_t used = alloc[BDEV_DB] == shared_bdev_alloc ?
- shared_bdev_used.load() :
- block_all[BDEV_DB].size() - alloc[BDEV_DB]->get_free();
- logger->set(l_bluefs_db_used_bytes, used);
+ logger->set(l_bluefs_db_total_bytes, _get_total(BDEV_DB));
+ logger->set(l_bluefs_db_used_bytes, _get_used(BDEV_DB));
}
if (alloc[BDEV_SLOW]) {
- logger->set(l_bluefs_slow_total_bytes, block_all[BDEV_SLOW].size());
- uint64_t used = alloc[BDEV_SLOW] == shared_bdev_alloc ?
- shared_bdev_used.load() :
- block_all[BDEV_SLOW].size() - alloc[BDEV_SLOW]->get_free();
- logger->set(l_bluefs_slow_used_bytes, used);
+ logger->set(l_bluefs_slow_total_bytes, _get_total(BDEV_SLOW));
+ logger->set(l_bluefs_slow_used_bytes, _get_used(BDEV_SLOW));
}
}
int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
- bool shared_with_bluestore,
- Allocator* _shared_bdev_alloc)
+ uint64_t reserved, // bytes at the start of the device kept out of BlueFS's usable total (stored in block_reserved[id])
+ bool shared_with_bluestore) // true when this device uses the allocator held in shared_alloc
{
- dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
+ dout(10) << __func__ << " bdev " << id << " path " << path << " "
+ << reserved << dendl;
ceph_assert(id < bdev.size());
ceph_assert(bdev[id] == NULL);
BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
discard_cb[id], static_cast<void*>(this));
+ block_reserved[id] = reserved; // remembered per device; _get_total() subtracts it from the device size
if (shared_with_bluestore) {
b->set_no_exclusive_lock();
}
<< " size " << byte_u_t(b->get_size()) << dendl;
bdev[id] = b;
ioc[id] = new IOContext(cct, NULL);
- if (_shared_bdev_alloc) {
- ceph_assert(shared_bdev_alloc == nullptr);
- alloc[id] = shared_bdev_alloc = _shared_bdev_alloc;
- need_shared_alloc_init = true;
+ if (shared_with_bluestore) { // the allocator now comes from the context passed to the constructor, not from an argument
+ ceph_assert(shared_alloc); // to be set in ctor before
+ alloc[id] = shared_alloc->a; // use the allocator pointer currently held by the shared context
+ shared_alloc_id = id; // from here on is_shared_alloc(id) is true for this device
}
return 0;
}
return bdev[id]->supported_bdev_label();
}
-uint64_t BlueFS::get_block_device_size(unsigned id)
+uint64_t BlueFS::get_block_device_size(unsigned id) const // const so const helpers such as _get_total() can call it; yields 0 for an out-of-range or absent device
{
if (id < bdev.size() && bdev[id])
return bdev[id]->get_size();
return 0;
}
-void BlueFS::_add_block_extent(bool create, unsigned id, uint64_t offset,
- uint64_t length, bool skip)
-{
- dout(1) << __func__ << " bdev " << id
- << " create " << create
- << " 0x" << std::hex << offset << "~" << length << std::dec
- << " skip " << skip
- << dendl;
-
- ceph_assert(id < bdev.size());
- ceph_assert(bdev[id]);
- ceph_assert(bdev[id]->get_size() >= offset + length);
- block_all[id].insert(offset, length);
-
- if (!create) {
- ceph_assert(id < alloc.size());
- ceph_assert(alloc[id]);
- if (!skip)
- log_t.op_alloc_add(id, offset, length);
- if (alloc[id] != shared_bdev_alloc) {
- alloc[id]->init_add_free(offset, length);
- }
- }
-
- dout(10) << __func__ << " done" << dendl;
-}
-
void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
{
dout(10) << __func__ << " bdev " << id << dendl;
ceph_assert(alloc[id]);
alloc[id]->release(to_release);
- if (alloc[id] == shared_bdev_alloc) {
- shared_bdev_used -= to_release.size();
+ if (is_shared_alloc(id)) { // id is the device registered as shared in add_block_device()
+ shared_alloc->bluefs_used -= to_release.size(); // keep the shared-context usage counter in step with the released extents
}
}
std::lock_guard l(lock);
uint64_t used = 0;
for (unsigned id = 0; id < MAX_BDEV; ++id) {
- if (alloc[id]) {
- if (alloc[id] != shared_bdev_alloc) {
- used += block_all[id].size() - alloc[id]->get_free();
- } else {
- used += shared_bdev_used;
- }
- }
+ used += _get_used(id);
}
return used;
}
-uint64_t BlueFS::get_used(unsigned id)
+uint64_t BlueFS::_get_used(unsigned id) const // bytes BlueFS uses on device id; takes no lock itself and does not range-check id
{
- ceph_assert(id < alloc.size());
- ceph_assert(alloc[id]);
- std::lock_guard l(lock);
uint64_t used = 0;
- if (alloc[id] != shared_bdev_alloc) {
- used = block_all[id].size() - alloc[id]->get_free();
+ if (!alloc[id]) // no allocator for this device: report nothing in use instead of asserting
+ return 0;
+
+ if (is_shared_alloc(id)) { // shared device: usage is read from the counter in the shared context
+ used = shared_alloc->bluefs_used;
} else {
- used += shared_bdev_used;
+ used = _get_total(id) - alloc[id]->get_free(); // dedicated device: usable total minus what the allocator reports free
}
return used;
}
+uint64_t BlueFS::get_used(unsigned id) // public, locking wrapper around _get_used()
+{
+ ceph_assert(id < alloc.size()); // unlike _get_used(), an unknown device id is a caller error here
+ ceph_assert(alloc[id]); // and so is a device without an allocator
+ std::lock_guard l(lock);
+ return _get_used(id);
+}
+
+uint64_t BlueFS::_get_total(unsigned id) const // usable bytes on device id; takes no lock itself
+{
+ ceph_assert(id < bdev.size());
+ ceph_assert(id < block_reserved.size());
+ return get_block_device_size(id) - block_reserved[id]; // device size (0 if the device is absent) minus the reserved prefix; NOTE(review): unsigned, so this wraps if reserved exceeds the size - confirm callers rule that out
+}
+
uint64_t BlueFS::get_total(unsigned id)
{
std::lock_guard l(lock);
- ceph_assert(id < block_all.size());
- return block_all[id].size();
+ return _get_total(id); // device size minus the reserved prefix; _get_total() asserts that id is in range
}
uint64_t BlueFS::get_free(unsigned id)
if (!bdev[i]) {
continue;
}
- auto owned = get_total(i);
+ auto total = get_total(i);
auto free = get_free(i);
- out << i << " : device size 0x" << std::hex << bdev[i]->get_size()
- << " : own 0x" << block_all[i]
- << " = 0x" << owned
- << " : using 0x" << owned - free
- << std::dec << "(" << byte_u_t(owned - free) << ")";
+ out << i << " : device size 0x" << std::hex << total
+ << " : using 0x" << total - free
+ << std::dec << "(" << byte_u_t(total - free) << ")";
out << "\n";
}
}
-void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
-{
- std::lock_guard l(lock);
- usage->resize(bdev.size());
- for (unsigned id = 0; id < bdev.size(); ++id) {
- if (!bdev[id]) {
- (*usage)[id] = make_pair(0, 0);
- continue;
- }
- (*usage)[id].first = alloc[id]->get_free();
- (*usage)[id].second = block_all[id].size();
- uint64_t used =
- (block_all[id].size() - (*usage)[id].first) * 100 / block_all[id].size();
- dout(10) << __func__ << " bdev " << id
- << " free " << (*usage)[id].first
- << " (" << byte_u_t((*usage)[id].first) << ")"
- << " / " << (*usage)[id].second
- << " (" << byte_u_t((*usage)[id].second) << ")"
- << ", used " << used << "%"
- << dendl;
- }
-}
-
int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
{
std::lock_guard l(lock);
dout(10) << __func__ << " bdev " << id << dendl;
- if (id >= block_all.size())
+ if (id >= alloc.size())
return -EINVAL;
- if (alloc[id] && alloc[id] == shared_bdev_alloc) {
- for (auto& p : file_map) {
- for (auto& q : p.second->fnode.extents) {
- if (alloc[q.bdev] == shared_bdev_alloc) {
- extents->insert(q.offset, q.length);
- }
+ for (auto& p : file_map) {
+ for (auto& q : p.second->fnode.extents) {
+ if (q.bdev == id && alloc[q.bdev] == shared_alloc->a) {
+ extents->insert(q.offset, q.length);
}
}
- } else {
- *extents = block_all[id];
}
return 0;
}
// initial txn
log_t.op_init();
- for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
- interval_set<uint64_t>& p = block_all[bdev];
- if (p.empty())
- continue;
- for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
- dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
- << std::hex << q.get_start() << "~" << q.get_len() << std::dec
- << dendl;
- log_t.op_alloc_add(bdev, q.get_start(), q.get_len());
- }
- }
_flush_and_sync_log(l);
// write supers
super = bluefs_super_t();
_close_writer(log_writer);
log_writer = NULL;
- block_all.clear();
vselector.reset(nullptr);
_stop_alloc();
_shutdown_logger();
- need_shared_alloc_init = false;
+ if (shared_alloc) {
+ ceph_assert(shared_alloc->need_init);
+ shared_alloc->need_init = false;
+ }
dout(10) << __func__ << " success" << dendl;
return 0;
void BlueFS::_init_alloc()
{
dout(20) << __func__ << dendl;
- block_unused_too_granular.resize(MAX_BDEV);
if (bdev[BDEV_WAL]) {
alloc_size[BDEV_WAL] = cct->_conf->bluefs_alloc_size;
}
ceph_assert(bdev[id]->get_size());
ceph_assert(alloc_size[id]);
- if (alloc[id]) {
+ if (is_shared_alloc(id)) {
dout(1) << __func__ << " shared, id " << id
<< " alloc_size 0x" << std::hex << alloc_size[id]
<< " size 0x" << bdev[id]->get_size() << std::dec << dendl;
- shared_bdev_used = 0;
} else {
std::string name = "bluefs-";
const char* devnames[] = { "wal","db","slow" };
alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
bdev[id]->get_size(),
alloc_size[id], name);
- interval_set<uint64_t>& p = block_all[id];
- for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
- alloc[id]->init_add_free(q.get_start(), q.get_len());
- }
+ alloc[id]->init_add_free(
+ block_reserved[id],
+ _get_total(id) - block_reserved[id]);
}
}
}
}
for (size_t i = 0; i < alloc.size(); ++i) {
- if (alloc[i] && alloc[i] != shared_bdev_alloc) {
+ if (alloc[i] && !is_shared_alloc(i)) {
alloc[i]->shutdown();
delete alloc[i];
alloc[i] = nullptr;
}
}
- block_unused_too_granular.clear();
}
int BlueFS::mount()
{
dout(1) << __func__ << dendl;
+ bool shared_alloc_ready = shared_alloc && shared_alloc->a;
int r = _open_super();
if (r < 0) {
derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
}
- block_all.clear();
- block_all.resize(MAX_BDEV);
_init_alloc();
_init_logger();
for (auto& p : file_map) {
dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
for (auto& q : p.second->fnode.extents) {
- if (alloc[q.bdev] == shared_bdev_alloc) {
- if (need_shared_alloc_init) {
+ if (is_shared_alloc(q.bdev)) {
+ // we might have still uninitialized shared_alloc at this point
+ // just bypass initialization then
+ if (shared_alloc_ready && shared_alloc->need_init) {
+ ceph_assert(shared_alloc->a);
alloc[q.bdev]->init_rm_free(q.offset, q.length);
- shared_bdev_used += q.length;
+ shared_alloc->bluefs_used += q.length;
}
} else {
alloc[q.bdev]->init_rm_free(q.offset, q.length);
}
}
}
- need_shared_alloc_init = false;
- dout(1) << __func__ << " shared_bdev_used = " << shared_bdev_used << dendl;
+ if (shared_alloc_ready) {
+ shared_alloc->need_init = false;
+ }
+ dout(1) << __func__ << " shared_bdev_used = "
+ << (shared_alloc_ready ? (int64_t)shared_alloc->bluefs_used : -1)
+ << dendl;
// set up the log for future writes
log_writer = _create_writer(_get_file(1));
int BlueFS::_check_new_allocations(const bluefs_fnode_t& fnode,
size_t dev_count,
- boost::dynamic_bitset<uint64_t>* owned_blocks,
boost::dynamic_bitset<uint64_t>* used_blocks)
{
auto& fnode_extents = fnode.extents;
auto id = e.bdev;
bool fail = false;
ceph_assert(id < dev_count);
- apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id],
- [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
- if (!bs.test(pos)) {
- fail = true;
- }
- }
- );
- if (fail) {
- derr << __func__ << " invalid extent " << int(id)
- << ": 0x" << std::hex << e.offset << "~" << e.length
- << std::dec
- << ": wasn't given but allocated for ino " << fnode.ino
- << dendl;
- return -EFAULT;
- }
apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
[&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
return 0;
}
-int BlueFS::_adjust_granularity(
- __u8 id, uint64_t *offset, uint64_t *length, bool alloc)
-{
- const char *op = alloc ? "op_alloc_add" : "op_alloc_rm";
- auto oldo = *offset;
- auto oldl = *length;
- if (*offset & (alloc_size[id] - 1)) {
- *offset &= ~(alloc_size[id] - 1);
- *offset += alloc_size[id];
- if (*length > *offset - oldo) {
- if (alloc) {
- block_unused_too_granular[id].insert(oldo, *offset - oldo);
- } else {
- block_unused_too_granular[id].erase(oldo, *offset - oldo);
- }
- *length -= (*offset - oldo);
- } else {
- if (alloc) {
- block_unused_too_granular[id].insert(oldo, *length);
- } else {
- block_unused_too_granular[id].erase(oldo, *length);
- }
- *length = 0;
- }
- }
- if (*length & (alloc_size[id] - 1)) {
- *length &= ~(alloc_size[id] - 1);
- if (alloc) {
- block_unused_too_granular[id].insert(
- *offset + *length,
- oldo + oldl - *offset - *length);
- } else {
- block_unused_too_granular[id].erase(
- *offset + *length,
- oldo + oldl - *offset - *length);
- }
- }
- if (oldo != *offset || oldl != *length) {
- dout(10) << __func__ << " " << op << " "
- << (int)id << ":" << std::hex << oldo << "~" << oldl
- << " -> " << (int)id << ":" << *offset << "~" << *length << dendl;
- }
- return 0;
-}
-
int BlueFS::_verify_alloc_granularity(
__u8 id, uint64_t offset, uint64_t length, const char *op)
{
FileRef log_file;
log_file = _get_file(1);
- // sanity check
- for (auto& a : block_unused_too_granular) {
- ceph_assert(a.empty());
- }
-
log_file->fnode = super.log_fnode;
if (!noop) {
log_file->vselector_hint =
bool seen_recs = false;
boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
- boost::dynamic_bitset<uint64_t> owned_blocks[MAX_BDEV];
if (!noop) {
if (cct->_conf->bluefs_log_replay_check_allocations) {
for (size_t i = 0; i < MAX_BDEV; ++i) {
if (alloc_size[i] != 0 && bdev[i] != nullptr) {
used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
- owned_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
}
}
}
break;
case bluefs_transaction_t::OP_ALLOC_ADD:
+ // LEGACY, do nothing but read params
{
- __u8 id;
- uint64_t offset, length;
- decode(id, p);
- decode(offset, p);
- decode(length, p);
- dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
- << ": op_alloc_add " << " " << (int)id
- << ":0x" << std::hex << offset << "~" << length << std::dec
- << dendl;
- if (unlikely(to_stdout)) {
- std::cout << " 0x" << std::hex << pos << std::dec
- << ": op_alloc_add " << " " << (int)id
- << ":0x" << std::hex << offset << "~" << length << std::dec
- << std::endl;
- }
- if (!noop) {
- block_all[id].insert(offset, length);
- _adjust_granularity(id, &offset, &length, true);
- if (length &&
- alloc[id] != shared_bdev_alloc) {
- alloc[id]->init_add_free(offset, length);
- }
-
- if (cct->_conf->bluefs_log_replay_check_allocations) {
- bool fail = false;
- apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id],
- [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
- if (bs.test(pos)) {
- fail = true;
- } else {
- bs.set(pos);
- }
- }
- );
- if (fail) {
- derr << __func__ << " invalid extent " << (int)id
- << ": 0x" << std::hex << offset << "~" << length
- << std::dec << ": already given" << dendl;
- return -EFAULT;
- }
- apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id],
- [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
- if (bs.test(pos)) {
- fail = true;
- }
- }
- );
- if (fail) {
- derr << __func__ << " invalid extent " << int(id)
- << ": 0x" << std::hex << offset << "~" << length
- << std::dec << ": already in use" << dendl;
- return -EFAULT;
- }
- }
- }
- }
+ __u8 id;
+ uint64_t offset, length;
+ decode(id, p);
+ decode(offset, p);
+ decode(length, p);
+ }
break;
case bluefs_transaction_t::OP_ALLOC_RM:
+ // LEGACY, do nothing but read params
{
- __u8 id;
- uint64_t offset, length;
- decode(id, p);
- decode(offset, p);
- decode(length, p);
- dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
- << ": op_alloc_rm " << " " << (int)id
- << ":0x" << std::hex << offset << "~" << length << std::dec
- << dendl;
- if (unlikely(to_stdout)) {
- std::cout << " 0x" << std::hex << pos << std::dec
- << ": op_alloc_rm " << " " << (int)id
- << ":0x" << std::hex << offset << "~" << length << std::dec
- << std::endl;
- }
- if (!noop) {
- block_all[id].erase(offset, length);
- _adjust_granularity(id, &offset, &length, false);
- if (length && alloc[id] != shared_bdev_alloc) {
- alloc[id]->init_rm_free(offset, length);
- }
- if (cct->_conf->bluefs_log_replay_check_allocations) {
- bool fail = false;
- apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id],
- [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
- if (!bs.test(pos)) {
- fail = true;
- } else {
- bs.reset(pos);
- }
- }
- );
- if (fail) {
- derr << __func__ << " invalid extent " << int(id)
- << ": 0x" << std::hex << offset << "~" << length
- << std::dec << ": wasn't given" << dendl;
- return -EFAULT;
- }
-
- apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id],
- [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
- if (bs.test(pos)) {
- fail = true;
- }
- }
- );
- if (fail) {
- derr << __func__ << " invalid extent " << (int)id
- << ": 0x" << std::hex << offset << "~" << length
- << std::dec << ": still in use" << dendl;
- return -EFAULT;
- }
- }
- }
- }
- break;
+ __u8 id;
+ uint64_t offset, length;
+ decode(id, p);
+ decode(offset, p);
+ decode(length, p);
+ }
+ break;
case bluefs_transaction_t::OP_DIR_LINK:
{
if (first_log_check) {
first_log_check = false;
int r = _check_new_allocations(log_file->fnode,
- MAX_BDEV, owned_blocks, used_blocks);
+ MAX_BDEV, used_blocks);
if (r < 0) {
return r;
}
}
if (cct->_conf->bluefs_log_replay_check_allocations) {
int r = _check_new_allocations(f->fnode,
- MAX_BDEV, owned_blocks, used_blocks);
+ MAX_BDEV, used_blocks);
if (r < 0) {
return r;
}
for (auto e : fnode_extents) {
auto id = e.bdev;
bool fail = false;
- apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id],
- [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
- if (!bs.test(pos)) {
- fail = true;
- }
- }
- );
- if (fail) {
- derr << __func__ << " invalid extent " << int(id)
- << ": 0x" << std::hex << e.offset << "~" << e.length
- << std::dec
- << ": wasn't given but is allocated for removed ino " << ino
- << dendl;
- return -EFAULT;
- }
apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
[&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
if (!noop && first_log_check &&
cct->_conf->bluefs_log_replay_check_allocations) {
int r = _check_new_allocations(log_file->fnode,
- MAX_BDEV, owned_blocks, used_blocks);
+ MAX_BDEV, used_blocks);
if (r < 0) {
return r;
}
}
}
- for (unsigned id = 0; id < block_unused_too_granular.size(); ++id) {
- dout(10) << __func__ << " block_unused_too_granular " << id << ": "
- << block_unused_too_granular[id] << dendl;
- }
dout(10) << __func__ << " done" << dendl;
return 0;
}
PExtentVector to_release;
to_release.emplace_back(old_ext.offset, old_ext.length);
alloc[old_ext.bdev]->release(to_release);
- if (alloc[old_ext.bdev] == shared_bdev_alloc) {
- shared_bdev_used -= to_release.size();
+ if (is_shared_alloc(old_ext.bdev)) {
+ shared_alloc->bluefs_used -= to_release.size();
}
}
PExtentVector to_release;
to_release.emplace_back(old_ext.offset, old_ext.length);
alloc[old_ext.bdev]->release(to_release);
- if (alloc[old_ext.bdev] == shared_bdev_alloc) {
- shared_bdev_used -= to_release.size();
+ if (is_shared_alloc(old_ext.bdev)) {
+ shared_alloc->bluefs_used -= to_release.size();
}
}
int avg_file_size = 12;
uint64_t size = 4096 * 2;
size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
- for (auto& p : block_all)
- size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
size += dir_map.size() + (1 + avg_dir_size);
size += file_map.size() * (1 + avg_dir_size + avg_file_size);
return round_up_to(size, super.block_size);
dout(20) << __func__ << " op_init" << dendl;
t->op_init();
- for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
- interval_set<uint64_t>& p = block_all[bdev];
- for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
- auto bdev_new = bdev;
- if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) {
- continue;
- }
- if ((flags & REMOVE_DB) && bdev == BDEV_DB) {
- continue;
- }
- if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
- bdev_new = BDEV_DB;
- }
- if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
- bdev_new = BDEV_SLOW;
- }
- if (bdev == BDEV_NEWDB) {
- // REMOVE_DB xor RENAME_DB
- ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
- ceph_assert(!(flags & RENAME_SLOW2DB));
- bdev_new = BDEV_DB;
- }
- if (bdev == BDEV_NEWWAL) {
- ceph_assert(flags & REMOVE_WAL);
- bdev_new = BDEV_WAL;
- }
- dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x"
- << std::hex << q.get_start() << "~" << q.get_len() << std::dec
- << dendl;
- t->op_alloc_add(bdev_new, q.get_start(), q.get_len());
- }
- }
for (auto& [ino, file_ref] : file_map) {
if (ino == 1)
continue;
}
}
alloc[i]->release(to_release[i]);
- if (alloc[i] == shared_bdev_alloc) {
- shared_bdev_used -= to_release[i].size();
+ if (is_shared_alloc(i)) {
+ shared_alloc->bluefs_used -= to_release[i].size();
}
}
}
for (unsigned i = 0; i < MAX_BDEV; i++) {
// alloc space from BDEV_SLOW is unexpected.
// So most cases we don't alloc from BDEV_SLOW and so avoiding flush not-used device.
- if (bdev[i] && ((i != BDEV_SLOW) || (block_all[i].size() - alloc[i]->get_free()))) {
+ if (bdev[i] && (i != BDEV_SLOW || _get_used(i))) {
bdev[i]->flush();
}
}
alloc[id]->dump();
return -ENOSPC;
}
- if (alloc[id] == shared_bdev_alloc) {
- shared_bdev_used += alloc_len;
+ if (is_shared_alloc(id)) {
+ shared_alloc->bluefs_used += alloc_len;
}
return 0;
<< std::dec << dendl;
return -ENOSPC;
} else {
- uint64_t total_allocated =
- block_all[id].size() - alloc[id]->get_free();
- if (max_bytes[id] < total_allocated) {
- logger->set(max_bytes_pcounters[id], total_allocated);
- max_bytes[id] = total_allocated;
+ uint64_t used = _get_used(id);
+ if (max_bytes[id] < used) {
+ logger->set(max_bytes_pcounters[id], used);
+ max_bytes[id] = used;
}
- if (alloc[id] == shared_bdev_alloc) {
- shared_bdev_used += alloc_len;
+ if (is_shared_alloc(id)) {
+ shared_alloc->bluefs_used += alloc_len;
}
}
return 0;
}
-void BlueFS::debug_inject_duplicate_gift(unsigned id,
- uint64_t offset,
- uint64_t len)
-{
- dout(0) << __func__ << dendl;
- if (id < alloc.size() && alloc[id]) {
- alloc[id]->init_add_free(offset, len);
- }
-}
-
// ===============================================
// OriginalVolumeSelector
virtual void get_paths(const std::string& base, paths& res) const = 0;
virtual void dump(std::ostream& sout) = 0;
};
-class BlueFS;
+
+struct bluefs_shared_alloc_context_t { // allocator state for the shared device, handed to BlueFS by pointer through its constructor
+ bool need_init = false; // raised by set(); BlueFS::mount() checks it before removing its file extents from the allocator and clears it afterwards
+ Allocator* a = nullptr; // the shared allocator; this struct never deletes it
+
+ std::atomic<uint64_t> bluefs_used = 0; // bytes BlueFS currently holds on the shared device (raised on allocate, lowered on release)
+
+ void set(Allocator* _a) { // install an allocator and restart BlueFS accounting from zero
+ a = _a;
+ need_init = true;
+ bluefs_used = 0;
+ }
+ void reset() { // drop the allocator pointer only; need_init and bluefs_used keep their values
+ a = nullptr;
+ }
+};
class BlueFS {
public:
*/
std::vector<BlockDevice*> bdev; ///< block devices we can use
std::vector<IOContext*> ioc; ///< IOContexts for bdevs
- std::vector<interval_set<uint64_t> > block_all; ///< extents in bdev we own
+ std::vector<uint64_t> block_reserved; ///< starting reserve extent per device
std::vector<Allocator*> alloc; ///< allocators for bdevs
std::vector<uint64_t> alloc_size; ///< alloc size for each device
std::vector<interval_set<uint64_t>> pending_release; ///< extents to release
- std::vector<interval_set<uint64_t>> block_unused_too_granular;
+ //std::vector<interval_set<uint64_t>> block_unused_too_granular;
BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev
std::unique_ptr<BlueFSVolumeSelector> vselector;
- bool need_shared_alloc_init = false;
- Allocator* shared_bdev_alloc = nullptr;
- std::atomic<uint64_t> shared_bdev_used = 0;
-
+ bluefs_shared_alloc_context_t* shared_alloc = nullptr;
+ unsigned shared_alloc_id = unsigned(-1);
+ inline bool is_shared_alloc(unsigned id) const {
+ return id == shared_alloc_id;
+ }
class SocketHook;
SocketHook* asok_hook = nullptr;
void _pad_bl(ceph::buffer::list& bl); ///< pad ceph::buffer::list to block size w/ zeros
+ uint64_t _get_used(unsigned id) const;
+ uint64_t _get_total(unsigned id) const;
+
+
FileRef _get_file(uint64_t ino);
void _drop_link(FileRef f);
int _write_super(int dev);
int _check_new_allocations(const bluefs_fnode_t& fnode,
size_t dev_count,
- boost::dynamic_bitset<uint64_t>* owned_blocks,
boost::dynamic_bitset<uint64_t>* used_blocks);
int _verify_alloc_granularity(
__u8 id, uint64_t offset, uint64_t length,
const char *op);
- int _adjust_granularity(
- __u8 id, uint64_t *offset, uint64_t *length, bool alloc);
int _replay(bool noop, bool to_stdout = false); ///< replay journal
FileWriter *_create_writer(FileRef f);
return 4096;
}
- void _add_block_extent(bool create, unsigned bdev, uint64_t offset,
- uint64_t len, bool skip=false);
-
public:
- BlueFS(CephContext* cct);
+ BlueFS(CephContext* cct, bluefs_shared_alloc_context_t* _shared_alloc);
~BlueFS();
// the super is always stored on bdev 0
uint64_t get_total(unsigned id);
uint64_t get_free(unsigned id);
uint64_t get_used(unsigned id);
- void get_usage(std::vector<pair<uint64_t,uint64_t>> *usage); // [<free,total> ...]
void dump_perf_counters(ceph::Formatter *f);
void dump_block_extents(std::ostream& out);
}
int add_block_device(unsigned bdev, const std::string& path, bool trim,
- bool shared_with_bluestore = false,
- Allocator* shared_bdev_alloc = nullptr);
+ uint64_t reserved,
+ bool shared_with_bluestore = false);
bool bdev_support_label(unsigned id);
- uint64_t get_block_device_size(unsigned bdev);
-
- /// gift more block space
- void add_block_extent(bool create, unsigned bdev, uint64_t offset, uint64_t len,
- bool skip=false) {
- std::unique_lock l(lock);
- _add_block_extent(create, bdev, offset, len, skip);
- int r = _flush_and_sync_log(l);
- ceph_assert(r == 0);
- }
+ uint64_t get_block_device_size(unsigned bdev) const;
// handler for discard event
void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);
bufferlist* bl);
/// test purpose methods
- void debug_inject_duplicate_gift(unsigned bdev, uint64_t offset, uint64_t len);
const PerfCounters* get_perf_counters() const {
return logger;
}
void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
{
dout(10) << __func__ << dendl;
- ceph_assert(alloc);
- alloc->release(to_release);
+ ceph_assert(shared_alloc.a); // the allocator now lives inside the shared_alloc context instead of a bare alloc member
+ shared_alloc.a->release(to_release); // hand the discarded extents back to that allocator
}
BlueStore::BlueStore(CephContext *cct, const string& path)
void BlueStore::_validate_bdev()
{
ceph_assert(bdev);
- ceph_assert(min_alloc_size); // _get_odisk_reserved depends on that
uint64_t dev_size = bdev->get_size();
ceph_assert(dev_size > _get_ondisk_reserved());
}
int BlueStore::_open_alloc()
{
- ceph_assert(alloc == NULL);
+ ceph_assert(shared_alloc.a == NULL);
ceph_assert(bdev->get_size());
uint64_t alloc_size = min_alloc_size;
alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
}
- alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
+ shared_alloc.set(Allocator::create(cct, cct->_conf->bluestore_allocator,
bdev->get_size(),
- alloc_size, "block");
+ alloc_size, "block"));
- if (!alloc) {
+ if (!shared_alloc.a) {
lderr(cct) << __func__ << " Allocator::unknown alloc type "
<< cct->_conf->bluestore_allocator
<< dendl;
}
if (bdev->is_smr()) {
- alloc->set_zone_states(fm->get_zone_states(db));
+ shared_alloc.a->set_zone_states(fm->get_zone_states(db));
}
uint64_t num = 0, bytes = 0;
fm->enumerate_reset();
uint64_t offset, length;
while (fm->enumerate_next(db, &offset, &length)) {
- alloc->init_add_free(offset, length);
+ shared_alloc.a->init_add_free(offset, length);
++num;
bytes += length;
}
dout(1) << __func__ << " loaded " << byte_u_t(bytes)
<< " in " << num << " extents"
- << " available " << byte_u_t(alloc->get_free())
+ << " available " << byte_u_t(shared_alloc.a->get_free())
<< dendl;
return 0;
ceph_assert(bdev);
bdev->discard_drain();
- ceph_assert(alloc);
- alloc->shutdown();
- delete alloc;
- alloc = NULL;
+ ceph_assert(shared_alloc.a);
+ shared_alloc.a->shutdown();
+ delete shared_alloc.a;
+ shared_alloc.reset();
}
int BlueStore::_open_fsid(bool create)
int BlueStore::_minimal_open_bluefs(bool create)
{
int r;
- bluefs = new BlueFS(cct);
+ bluefs = new BlueFS(cct, &shared_alloc);
string bfn;
struct stat st;
if (::stat(bfn.c_str(), &st) == 0) {
r = bluefs->add_block_device(
BlueFS::BDEV_DB, bfn,
- create && cct->_conf->bdev_enable_discard);
+ create && cct->_conf->bdev_enable_discard,
+ SUPER_RESERVED);
if (r < 0) {
derr << __func__ << " add block device(" << bfn << ") returned: "
<< cpp_strerror(r) << dendl;
goto free_bluefs;
}
}
- if (create) {
- bluefs->add_block_extent(
- create,
- BlueFS::BDEV_DB,
- SUPER_RESERVED,
- bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
- }
bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
bluefs_layout.dedicated_db = true;
} else {
bfn = path + "/block";
// never trim here
r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
- true,
- alloc);
+ 0, // no need to provide valid 'reserved' for shared dev
+ true);
if (r < 0) {
derr << __func__ << " add block device(" << bfn << ") returned: "
<< cpp_strerror(r) << dendl;
goto free_bluefs;
}
- if (create) {
- auto reserved = _get_ondisk_reserved();
-
- bluefs->add_block_extent(
- create,
- bluefs_layout.shared_bdev,
- reserved,
- p2align(bdev->get_size(), min_alloc_size) - reserved);
- }
bfn = path + "/block.wal";
if (::stat(bfn.c_str(), &st) == 0) {
r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
- create && cct->_conf->bdev_enable_discard);
+ create && cct->_conf->bdev_enable_discard,
+ BDEV_LABEL_BLOCK_SIZE);
if (r < 0) {
derr << __func__ << " add block device(" << bfn << ") returned: "
<< cpp_strerror(r) << dendl;
}
}
- if (create) {
- bluefs->add_block_extent(
- create,
- BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
- bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
- BDEV_LABEL_BLOCK_SIZE);
- }
bluefs_layout.dedicated_wal = true;
} else {
r = 0;
return r;
}
-int BlueStore::_open_bluefs(bool create)
+int BlueStore::_open_bluefs(bool create, bool read_only)
{
int r = _minimal_open_bluefs(create);
if (r < 0) {
if (do_bluefs) {
// open in read-only first to read FM list and init allocator
// as they might be needed for some BlueFS procedures
+
r = _open_db(false, false, true);
if (r < 0)
return r;
return -EINVAL;
}
- r = _open_bluefs(create);
+ r = _open_bluefs(create, read_only);
if (r < 0) {
return r;
}
cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
if (dump_interval > 0 &&
next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
- alloc->dump();
+ shared_alloc.a->dump();
next_dump_on_bluefs_alloc_failure = ceph_clock_now();
next_dump_on_bluefs_alloc_failure += dump_interval;
}
return r;
alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
}
- alloc = Allocator::create(cct, cct->_conf->bluestore_allocator,
+ shared_alloc.set(Allocator::create(cct, cct->_conf->bluestore_allocator,
bdev->get_size(),
- alloc_size, "block");
- if (!alloc) {
+ alloc_size, "block"));
+ if (!shared_alloc.a) {
r = -EINVAL;
goto out_close_bdev;
}
reserved = _get_ondisk_reserved();
- alloc->init_add_free(reserved,
+ shared_alloc.a->init_add_free(reserved,
p2align(bdev->get_size(), min_alloc_size) - reserved);
r = _open_db(true);
out_close_db:
_close_db(false);
out_close_bdev:
- delete alloc;
- alloc = nullptr;
+ delete shared_alloc.a;
+ shared_alloc.reset();
_close_bdev();
out_close_fsid:
_close_fsid();
r = _mount_for_bluefs();
- int reserved = 0;
if (id == BlueFS::BDEV_NEWWAL) {
string p = path + "/block.wal";
r = _setup_block_symlink_or_file("block.wal", dev_path,
ceph_assert(r == 0);
r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
- cct->_conf->bdev_enable_discard);
+ cct->_conf->bdev_enable_discard,
+ BDEV_LABEL_BLOCK_SIZE);
ceph_assert(r == 0);
if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
ceph_assert(r == 0);
}
- reserved = BDEV_LABEL_BLOCK_SIZE;
bluefs_layout.dedicated_wal = true;
} else if (id == BlueFS::BDEV_NEWDB) {
string p = path + "/block.db";
ceph_assert(r == 0);
r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
- cct->_conf->bdev_enable_discard);
+ cct->_conf->bdev_enable_discard,
+ SUPER_RESERVED);
ceph_assert(r == 0);
if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
true);
ceph_assert(r == 0);
}
- reserved = SUPER_RESERVED;
bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
bluefs_layout.dedicated_db = true;
}
bluefs->umount();
bluefs->mount();
- bluefs->add_block_extent(
- false,
- id,
- reserved,
- bluefs->get_block_device_size(id) - reserved, true);
-
r = bluefs->prepare_new_device(id, bluefs_layout);
ceph_assert(r == 0);
r = _mount_for_bluefs();
- int reserved = 0;
string link_db;
string link_wal;
if (devs_source.count(BlueFS::BDEV_DB) &&
bluefs_layout.dedicated_wal = true;
r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
- cct->_conf->bdev_enable_discard);
+ cct->_conf->bdev_enable_discard,
+ BDEV_LABEL_BLOCK_SIZE);
ceph_assert(r == 0);
if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
true);
ceph_assert(r == 0);
}
- reserved = BDEV_LABEL_BLOCK_SIZE;
} else if (id == BlueFS::BDEV_NEWDB) {
target_name = "block.db";
target_size = cct->_conf->bluestore_block_db_size;
bluefs_layout.dedicated_db = true;
r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
- cct->_conf->bdev_enable_discard);
+ cct->_conf->bdev_enable_discard,
+ SUPER_RESERVED);
ceph_assert(r == 0);
if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
true);
ceph_assert(r == 0);
}
- reserved = SUPER_RESERVED;
}
bluefs->umount();
bluefs->mount();
- bluefs->add_block_extent(
- false,
- id, reserved, bluefs->get_block_device_size(id) - reserved);
-
r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
if (r < 0) {
continue;
}
- interval_set<uint64_t> before;
- bluefs->get_block_extents(devid, &before);
- ceph_assert(!before.empty());
- uint64_t end = before.range_end();
- if (end < size) {
- out << devid
- <<" : expanding " << " from 0x" << std::hex
- << end << " to 0x" << size << std::dec << std::endl;
- bluefs->add_block_extent(false, devid, end, size-end);
- string p = get_device_path(devid);
- const char* path = p.c_str();
- if (path == nullptr) {
- derr << devid
- <<": can't find device path " << dendl;
- continue;
- }
- if (bluefs->bdev_support_label(devid)) {
- if (_set_bdev_label_size(p, size) >= 0) {
- out << devid
- << " : size label updated to " << size
- << std::endl;
- }
+ out << devid // size is streamed in hex so that it matches the literal "0x" prefix
+ <<" : expanding " << " to 0x" << std::hex << size << std::dec << std::endl;
+ string p = get_device_path(devid);
+ const char* path = p.c_str();
+ if (path == nullptr) { // NOTE(review): c_str() never yields nullptr, so this branch looks unreachable; p.empty() may be the intended test - confirm
+ derr << devid
+ <<": can't find device path " << dendl;
+ continue;
+ }
+ if (bluefs->bdev_support_label(devid)) {
+ if (_set_bdev_label_size(p, size) >= 0) {
+ out << devid
+ << " : size label updated to " << size
+ << std::endl;
}
}
}
out << bluefs_layout.shared_bdev
<< " : expanding " << " from 0x" << std::hex
<< size0 << " to 0x" << size << std::dec << std::endl;
- bluefs->add_block_extent(false,
- bluefs_layout.shared_bdev, size0, size - size0);
_write_out_fm_meta(size);
if (bdev->supported_bdev_label()) {
if (_set_bdev_label_size(path, size) >= 0) {
continue;
}
PExtentVector exts;
- int64_t alloc_len = alloc->allocate(e->length, min_alloc_size,
- 0, 0, &exts);
+ int64_t alloc_len =
+ shared_alloc.a->allocate(e->length, min_alloc_size,
+ 0, 0, &exts);
if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
derr << __func__
<< " failed to allocate 0x" << std::hex << e->length
<< " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
<< " min_alloc_size 0x" << min_alloc_size
- << " available 0x " << alloc->get_free()
+ << " available 0x " << shared_alloc.a->get_free()
<< std::dec << dendl;
if (alloc_len > 0) {
- alloc->release(exts);
+ shared_alloc.a->release(exts);
}
bypass_rest = true;
break;
<< "~" << it.get_len() << std::dec << dendl;
fm->release(it.get_start(), it.get_len(), txn);
}
- alloc->release(to_release);
+ shared_alloc.a->release(to_release);
to_release.clear();
} // if (it) {
} //if (repair && repairer.preprocess_misreference()) {
txn = db->get_transaction();
PExtentVector exts;
- int64_t alloc_len = alloc->allocate(len, min_alloc_size,
+ int64_t alloc_len = shared_alloc.a->allocate(len, min_alloc_size,
min_alloc_size * 256, 0, &exts);
ceph_assert(alloc_len >= (int64_t)len);
for (auto& p : exts) {
db->estimate_prefix_size(PREFIX_OMAP, string()) +
db->estimate_prefix_size(PREFIX_PERPOOL_OMAP, string());
- uint64_t bfree = alloc->get_free();
+ uint64_t bfree = shared_alloc.a->get_free();
if (bluefs) {
buf->internally_reserved = 0;
// write helpers
uint64_t BlueStore::_get_ondisk_reserved() const {
+ ceph_assert(min_alloc_size); // min_alloc_size is the rounding granule below, so it must already be non-zero
return round_up_to(
std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
}
}
dout(10) << __func__ << "(sync) " << txc << " " << std::hex
<< txc->released << std::dec << dendl;
- alloc->release(txc->released);
+ shared_alloc.a->release(txc->released);
}
out:
_reap_collections();
logger->set(l_bluestore_fragmentation,
- (uint64_t)(alloc->get_fragmentation() * 1000));
+ (uint64_t)(shared_alloc.a->get_fragmentation() * 1000));
log_latency("kv_final",
l_bluestore_kv_final_lat,
PExtentVector prealloc;
prealloc.reserve(2 * wctx->writes.size());;
int64_t prealloc_left = 0;
- prealloc_left = alloc->allocate(
+ prealloc_left = shared_alloc.a->allocate(
need, min_alloc_size, need,
0, &prealloc);
if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
derr << __func__ << " failed to allocate 0x" << std::hex << need
<< " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
<< " min_alloc_size 0x" << min_alloc_size
- << " available 0x " << alloc->get_free()
+ << " available 0x " << shared_alloc.a->get_free()
<< std::dec << dendl;
if (prealloc.size()) {
- alloc->release(prealloc);
+ shared_alloc.a->release(prealloc);
}
return -ENOSPC;
}
BlockDevice *bdev = nullptr;
std::string freelist_type;
FreelistManager *fm = nullptr;
- Allocator *alloc = nullptr;
+
+ bluefs_shared_alloc_context_t shared_alloc;
+
uuid_d fsid;
int path_fd = -1; ///< open handle to $path
int fsid_fd = -1; ///< open handle (locked) to $path/fsid
int _minimal_open_bluefs(bool create);
void _minimal_close_bluefs();
- int _open_bluefs(bool create);
+ int _open_bluefs(bool create, bool read_only);
void _close_bluefs(bool cold_close);
// Limited (u)mount intended for BlueFS operations only
ls.push_back(new bluefs_transaction_t);
ls.push_back(new bluefs_transaction_t);
ls.back()->op_init();
- ls.back()->op_alloc_add(0, 0, 123123211);
- ls.back()->op_alloc_rm(1, 0, 123);
ls.back()->op_dir_create("dir");
ls.back()->op_dir_create("dir2");
bluefs_fnode_t fnode;
typedef enum {
OP_NONE = 0,
OP_INIT, ///< initial (empty) file system marker
- OP_ALLOC_ADD, ///< add extent to available block storage (extent)
- OP_ALLOC_RM, ///< remove extent from available block storage (extent)
+ OP_ALLOC_ADD, ///< OBSOLETE: add extent to available block storage (extent)
+ OP_ALLOC_RM, ///< OBSOLETE: remove extent from available block storage (extent)
OP_DIR_LINK, ///< (re)set a dir entry (dirname, filename, ino)
OP_DIR_UNLINK, ///< remove a dir entry (dirname, filename)
OP_DIR_CREATE, ///< create a dir (dirname)
using ceph::encode;
encode((__u8)OP_INIT, op_bl);
}
- void op_alloc_add(uint8_t id, uint64_t offset, uint64_t length) {
- using ceph::encode;
- encode((__u8)OP_ALLOC_ADD, op_bl);
- encode(id, op_bl);
- encode(offset, op_bl);
- encode(length, op_bl);
- }
- void op_alloc_rm(uint8_t id, uint64_t offset, uint64_t length) {
- using ceph::encode;
- encode((__u8)OP_ALLOC_RM, op_bl);
- encode(id, op_bl);
- encode(offset, op_bl);
- encode(length, op_bl);
- }
void op_dir_create(const std::string& dir) {
using ceph::encode;
encode((__u8)OP_DIR_CREATE, op_bl);
WRITE_CLASS_ENCODER(bluefs_transaction_t)
std::ostream& operator<<(std::ostream& out, const bluefs_transaction_t& t);
-
#endif
cout << " -> " << target_path;
}
cout << std::endl;
- int r = fs->add_block_device(e.second, e.first, false);
+ int r = fs->add_block_device(e.second, e.first, false, 0); // 'reserved' is fake
if (r < 0) {
cerr << "unable to open " << e.first << ": " << cpp_strerror(r) << std::endl;
exit(EXIT_FAILURE);
}
}
-BlueFS *open_bluefs(
+BlueFS *open_bluefs_readonly(
CephContext *cct,
const string& path,
const vector<string>& devs)
{
validate_path(cct, path, true);
- BlueFS *fs = new BlueFS(cct);
+ // We provide no shared allocator, which prevents BlueFS from operating in R/W mode.
+ // Read-only mode isn't strictly enforced, though.
+ BlueFS *fs = new BlueFS(cct, nullptr);
add_devices(fs, cct, devs);
const vector<string>& devs)
{
validate_path(cct, path, true);
- BlueFS *fs = new BlueFS(cct);
+ // We provide no shared allocator, which prevents BlueFS from operating in R/W mode.
+ // Read-only mode isn't strictly enforced, though.
+ BlueFS *fs = new BlueFS(cct, nullptr);
add_devices(fs, cct, devs);
int r = fs->log_dump();
}
}
else if (action == "bluefs-export") {
- BlueFS *fs = open_bluefs(cct.get(), path, devs);
+ BlueFS *fs = open_bluefs_readonly(cct.get(), path, devs);
vector<string> dirs;
int r = fs->readdir("", &dirs);
uint64_t size = 1048576 * 128;
TempBdev bdev{size};
uuid_d fsid;
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
+ BlueFS fs(g_ceph_context, nullptr);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
}
TEST(BlueFS, mkfs_mount) {
uint64_t size = 1048576 * 128;
TempBdev bdev{size};
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
+ BlueFS fs(g_ceph_context, nullptr); // no shared allocator context is passed to this standalone instance
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); // 1048576 was formerly the start offset given to the removed add_block_extent() call
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
ASSERT_EQ(0, fs.mount());
fs.umount();
}
-TEST(BlueFS, mkfs_mount_duplicate_gift) {
- uint64_t size = 1048576 * 128;
- TempBdev bdev{ size };
- bluefs_extent_t dup_ext;
- {
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
- uuid_d fsid;
- ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
- ASSERT_EQ(0, fs.mount());
-
- {
- BlueFS::FileWriter *h;
- ASSERT_EQ(0, fs.mkdir("dir"));
- ASSERT_EQ(0, fs.open_for_write("dir", "file1", &h, false));
- h->append("foo", 3);
- h->append("bar", 3);
- h->append("baz", 3);
- fs.fsync(h);
- ceph_assert(h->file->fnode.extents.size() > 0);
- dup_ext = h->file->fnode.extents[0];
- ceph_assert(dup_ext.bdev == BlueFS::BDEV_DB);
- fs.close_writer(h);
- }
-
- fs.umount();
- }
-
- {
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- ASSERT_EQ(0, fs.mount());
- // free allocation presumably allocated for file1
- std::cout << "duplicate extent: " << std::hex
- << dup_ext.offset << "~" << dup_ext.length
- << std::dec << std::endl;
- fs.debug_inject_duplicate_gift(BlueFS::BDEV_DB, dup_ext.offset, dup_ext.length);
- {
- // overwrite file1 with file2
- BlueFS::FileWriter *h;
- ASSERT_EQ(0, fs.open_for_write("dir", "file2", &h, false));
- h->append("foo", 3);
- h->append("bar", 3);
- h->append("baz", 3);
- fs.fsync(h);
- fs.close_writer(h);
- }
- fs.umount();
- }
-
- g_ceph_context->_conf.set_val_or_die("bluefs_log_replay_check_allocations", "true");
- g_ceph_context->_conf.apply_changes(nullptr);
-
- {
- // this should fail
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- ASSERT_NE(0, fs.mount());
- }
-}
-
TEST(BlueFS, write_read) {
uint64_t size = 1048576 * 128;
TempBdev bdev{size};
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
+ BlueFS fs(g_ceph_context, nullptr);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
ASSERT_EQ(0, fs.mount());
TEST(BlueFS, small_appends) {
uint64_t size = 1048576 * 128;
TempBdev bdev{size};
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
+ BlueFS fs(g_ceph_context, nullptr);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
ASSERT_EQ(0, fs.mount());
// we'll write a ~5G file, so allocate more than that for the whole fs
uint64_t size = 1048576 * 1024 * 8ull;
TempBdev bdev{size};
- BlueFS fs(g_ceph_context);
+ BlueFS fs(g_ceph_context, nullptr);
bool old = g_ceph_context->_conf.get_val<bool>("bluefs_buffered_io");
g_ceph_context->_conf.set_val("bluefs_buffered_io", "false");
uint64_t total_written = 0;
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
ASSERT_EQ(0, fs.mount());
"65536");
g_ceph_context->_conf.apply_changes(nullptr);
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
+ BlueFS fs(g_ceph_context, nullptr);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
ASSERT_EQ(0, fs.mount());
"65536");
g_ceph_context->_conf.apply_changes(nullptr);
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
+ BlueFS fs(g_ceph_context, nullptr);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
ASSERT_EQ(0, fs.mount());
"65536");
g_ceph_context->_conf.apply_changes(nullptr);
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
+ BlueFS fs(g_ceph_context, nullptr);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
ASSERT_EQ(0, fs.mount());
uint64_t size = 1048576 * 128;
TempBdev bdev{size};
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
+ BlueFS fs(g_ceph_context, nullptr);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
ASSERT_EQ(0, fs.mount());
uint64_t size = 1048576 * 128;
TempBdev bdev{size};
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
+ BlueFS fs(g_ceph_context, nullptr);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
ASSERT_EQ(0, fs.mount());
"bluefs_compact_log_sync",
"true");
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
+ BlueFS fs(g_ceph_context, nullptr);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
ASSERT_EQ(0, fs.mount());
"bluefs_compact_log_sync",
"false");
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
+ BlueFS fs(g_ceph_context, nullptr);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
ASSERT_EQ(0, fs.mount());
"bluefs_compact_log_sync",
"false");
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
+ BlueFS fs(g_ceph_context, nullptr);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
ASSERT_EQ(0, fs.mount());
conf.SetVal("bluefs_sync_write", "true");
conf.ApplyChanges();
- BlueFS fs(g_ceph_context);
- ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
- fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576);
+ BlueFS fs(g_ceph_context, nullptr);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576));
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
ASSERT_EQ(0, fs.mount());