From: Igor Fedotov Date: Fri, 21 Aug 2020 09:09:43 +0000 (+0300) Subject: os/bluestore: get rid of obsolete stuff in bluefs. X-Git-Tag: v16.1.0~1128^2~3 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=50e44ac8c347e43a3a5c69a7afe7e16dd30bd419;p=ceph.git os/bluestore: get rid of obsolete stuff in bluefs. This primarily touches legacy BlueFS code intended to maintain main device space gifting/reclaiming, which is obsolete due to using a single main device allocator shared among BlueStore and BlueFS. Fixes: https://tracker.ceph.com/issues/46886 Signed-off-by: Igor Fedotov --- diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index d3ad01da1554..648a23ca052a 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -120,10 +120,8 @@ private: f->dump_string("device", bluefs->get_device_name(dev)); ceph_assert(bluefs->alloc[dev]); auto total = bluefs->get_total(dev); - auto free = bluefs->alloc[dev]->get_free(); - auto used = bluefs->alloc[dev] == bluefs->shared_bdev_alloc ? 
- bluefs->shared_bdev_used.load() : - total - free; + auto free = bluefs->get_free(dev); + auto used = bluefs->get_used(dev); f->dump_int("total", total); f->dump_int("free", free); @@ -172,14 +170,16 @@ private: } }; -BlueFS::BlueFS(CephContext* cct) +BlueFS::BlueFS(CephContext* cct, + bluefs_shared_alloc_context_t* _shared_alloc) : cct(cct), bdev(MAX_BDEV), ioc(MAX_BDEV), - block_all(MAX_BDEV), + block_reserved(MAX_BDEV), alloc(MAX_BDEV), alloc_size(MAX_BDEV, 0), - pending_release(MAX_BDEV) + pending_release(MAX_BDEV), + shared_alloc(_shared_alloc) { discard_cb[BDEV_WAL] = wal_discard_cb; discard_cb[BDEV_DB] = db_discard_cb; @@ -302,35 +302,30 @@ void BlueFS::_update_logger_stats() logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size); if (alloc[BDEV_WAL]) { - logger->set(l_bluefs_wal_total_bytes, block_all[BDEV_WAL].size()); - logger->set(l_bluefs_wal_used_bytes, - block_all[BDEV_WAL].size() - alloc[BDEV_WAL]->get_free()); + logger->set(l_bluefs_wal_total_bytes, _get_total(BDEV_WAL)); + logger->set(l_bluefs_wal_used_bytes, _get_used(BDEV_WAL)); } if (alloc[BDEV_DB]) { - logger->set(l_bluefs_db_total_bytes, block_all[BDEV_DB].size()); - uint64_t used = alloc[BDEV_DB] == shared_bdev_alloc ? - shared_bdev_used.load() : - block_all[BDEV_DB].size() - alloc[BDEV_DB]->get_free(); - logger->set(l_bluefs_db_used_bytes, used); + logger->set(l_bluefs_db_total_bytes, _get_total(BDEV_DB)); + logger->set(l_bluefs_db_used_bytes, _get_used(BDEV_DB)); } if (alloc[BDEV_SLOW]) { - logger->set(l_bluefs_slow_total_bytes, block_all[BDEV_SLOW].size()); - uint64_t used = alloc[BDEV_SLOW] == shared_bdev_alloc ? 
- shared_bdev_used.load() : - block_all[BDEV_SLOW].size() - alloc[BDEV_SLOW]->get_free(); - logger->set(l_bluefs_slow_used_bytes, used); + logger->set(l_bluefs_slow_total_bytes, _get_total(BDEV_SLOW)); + logger->set(l_bluefs_slow_used_bytes, _get_used(BDEV_SLOW)); } } int BlueFS::add_block_device(unsigned id, const string& path, bool trim, - bool shared_with_bluestore, - Allocator* _shared_bdev_alloc) + uint64_t reserved, + bool shared_with_bluestore) { - dout(10) << __func__ << " bdev " << id << " path " << path << dendl; + dout(10) << __func__ << " bdev " << id << " path " << path << " " + << reserved << dendl; ceph_assert(id < bdev.size()); ceph_assert(bdev[id] == NULL); BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL, discard_cb[id], static_cast(this)); + block_reserved[id] = reserved; if (shared_with_bluestore) { b->set_no_exclusive_lock(); } @@ -347,10 +342,10 @@ int BlueFS::add_block_device(unsigned id, const string& path, bool trim, << " size " << byte_u_t(b->get_size()) << dendl; bdev[id] = b; ioc[id] = new IOContext(cct, NULL); - if (_shared_bdev_alloc) { - ceph_assert(shared_bdev_alloc == nullptr); - alloc[id] = shared_bdev_alloc = _shared_bdev_alloc; - need_shared_alloc_init = true; + if (shared_with_bluestore) { + ceph_assert(shared_alloc); // to be set in ctor before + alloc[id] = shared_alloc->a; + shared_alloc_id = id; } return 0; } @@ -362,47 +357,20 @@ bool BlueFS::bdev_support_label(unsigned id) return bdev[id]->supported_bdev_label(); } -uint64_t BlueFS::get_block_device_size(unsigned id) +uint64_t BlueFS::get_block_device_size(unsigned id) const { if (id < bdev.size() && bdev[id]) return bdev[id]->get_size(); return 0; } -void BlueFS::_add_block_extent(bool create, unsigned id, uint64_t offset, - uint64_t length, bool skip) -{ - dout(1) << __func__ << " bdev " << id - << " create " << create - << " 0x" << std::hex << offset << "~" << length << std::dec - << " skip " << skip - << dendl; - - ceph_assert(id < bdev.size()); - 
ceph_assert(bdev[id]); - ceph_assert(bdev[id]->get_size() >= offset + length); - block_all[id].insert(offset, length); - - if (!create) { - ceph_assert(id < alloc.size()); - ceph_assert(alloc[id]); - if (!skip) - log_t.op_alloc_add(id, offset, length); - if (alloc[id] != shared_bdev_alloc) { - alloc[id]->init_add_free(offset, length); - } - } - - dout(10) << __func__ << " done" << dendl; -} - void BlueFS::handle_discard(unsigned id, interval_set& to_release) { dout(10) << __func__ << " bdev " << id << dendl; ceph_assert(alloc[id]); alloc[id]->release(to_release); - if (alloc[id] == shared_bdev_alloc) { - shared_bdev_used -= to_release.size(); + if (is_shared_alloc(id)) { + shared_alloc->bluefs_used -= to_release.size(); } } @@ -411,36 +379,44 @@ uint64_t BlueFS::get_used() std::lock_guard l(lock); uint64_t used = 0; for (unsigned id = 0; id < MAX_BDEV; ++id) { - if (alloc[id]) { - if (alloc[id] != shared_bdev_alloc) { - used += block_all[id].size() - alloc[id]->get_free(); - } else { - used += shared_bdev_used; - } - } + used += _get_used(id); } return used; } -uint64_t BlueFS::get_used(unsigned id) +uint64_t BlueFS::_get_used(unsigned id) const { - ceph_assert(id < alloc.size()); - ceph_assert(alloc[id]); - std::lock_guard l(lock); uint64_t used = 0; - if (alloc[id] != shared_bdev_alloc) { - used = block_all[id].size() - alloc[id]->get_free(); + if (!alloc[id]) + return 0; + + if (is_shared_alloc(id)) { + used = shared_alloc->bluefs_used; } else { - used += shared_bdev_used; + used = _get_total(id) - alloc[id]->get_free(); } return used; } +uint64_t BlueFS::get_used(unsigned id) +{ + ceph_assert(id < alloc.size()); + ceph_assert(alloc[id]); + std::lock_guard l(lock); + return _get_used(id); +} + +uint64_t BlueFS::_get_total(unsigned id) const +{ + ceph_assert(id < bdev.size()); + ceph_assert(id < block_reserved.size()); + return get_block_device_size(id) - block_reserved[id]; +} + uint64_t BlueFS::get_total(unsigned id) { std::lock_guard l(lock); - ceph_assert(id 
< block_all.size()); - return block_all[id].size(); + return _get_total(id); } uint64_t BlueFS::get_free(unsigned id) @@ -463,57 +439,28 @@ void BlueFS::dump_block_extents(ostream& out) if (!bdev[i]) { continue; } - auto owned = get_total(i); + auto total = get_total(i); auto free = get_free(i); - out << i << " : device size 0x" << std::hex << bdev[i]->get_size() - << " : own 0x" << block_all[i] - << " = 0x" << owned - << " : using 0x" << owned - free - << std::dec << "(" << byte_u_t(owned - free) << ")"; + out << i << " : device size 0x" << std::hex << total + << " : using 0x" << total - free + << std::dec << "(" << byte_u_t(total - free) << ")"; out << "\n"; } } -void BlueFS::get_usage(vector> *usage) -{ - std::lock_guard l(lock); - usage->resize(bdev.size()); - for (unsigned id = 0; id < bdev.size(); ++id) { - if (!bdev[id]) { - (*usage)[id] = make_pair(0, 0); - continue; - } - (*usage)[id].first = alloc[id]->get_free(); - (*usage)[id].second = block_all[id].size(); - uint64_t used = - (block_all[id].size() - (*usage)[id].first) * 100 / block_all[id].size(); - dout(10) << __func__ << " bdev " << id - << " free " << (*usage)[id].first - << " (" << byte_u_t((*usage)[id].first) << ")" - << " / " << (*usage)[id].second - << " (" << byte_u_t((*usage)[id].second) << ")" - << ", used " << used << "%" - << dendl; - } -} - int BlueFS::get_block_extents(unsigned id, interval_set *extents) { std::lock_guard l(lock); dout(10) << __func__ << " bdev " << id << dendl; - if (id >= block_all.size()) + if (id >= alloc.size()) return -EINVAL; - if (alloc[id] && alloc[id] == shared_bdev_alloc) { - for (auto& p : file_map) { - for (auto& q : p.second->fnode.extents) { - if (alloc[q.bdev] == shared_bdev_alloc) { - extents->insert(q.offset, q.length); - } + for (auto& p : file_map) { + for (auto& q : p.second->fnode.extents) { + if (q.bdev == id && alloc[q.bdev] == shared_alloc->a) { + extents->insert(q.offset, q.length); } } - } else { - *extents = block_all[id]; } return 0; } @@ 
-557,17 +504,6 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout) // initial txn log_t.op_init(); - for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) { - interval_set& p = block_all[bdev]; - if (p.empty()) - continue; - for (interval_set::iterator q = p.begin(); q != p.end(); ++q) { - dout(20) << __func__ << " op_alloc_add " << bdev << " 0x" - << std::hex << q.get_start() << "~" << q.get_len() << std::dec - << dendl; - log_t.op_alloc_add(bdev, q.get_start(), q.get_len()); - } - } _flush_and_sync_log(l); // write supers @@ -580,11 +516,13 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout) super = bluefs_super_t(); _close_writer(log_writer); log_writer = NULL; - block_all.clear(); vselector.reset(nullptr); _stop_alloc(); _shutdown_logger(); - need_shared_alloc_init = false; + if (shared_alloc) { + ceph_assert(shared_alloc->need_init); + shared_alloc->need_init = false; + } dout(10) << __func__ << " success" << dendl; return 0; @@ -593,7 +531,6 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout) void BlueFS::_init_alloc() { dout(20) << __func__ << dendl; - block_unused_too_granular.resize(MAX_BDEV); if (bdev[BDEV_WAL]) { alloc_size[BDEV_WAL] = cct->_conf->bluefs_alloc_size; @@ -618,11 +555,10 @@ void BlueFS::_init_alloc() } ceph_assert(bdev[id]->get_size()); ceph_assert(alloc_size[id]); - if (alloc[id]) { + if (is_shared_alloc(id)) { dout(1) << __func__ << " shared, id " << id << " alloc_size 0x" << std::hex << alloc_size[id] << " size 0x" << bdev[id]->get_size() << std::dec << dendl; - shared_bdev_used = 0; } else { std::string name = "bluefs-"; const char* devnames[] = { "wal","db","slow" }; @@ -636,10 +572,9 @@ void BlueFS::_init_alloc() alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator, bdev[id]->get_size(), alloc_size[id], name); - interval_set& p = block_all[id]; - for (interval_set::iterator q = p.begin(); q != p.end(); ++q) { - alloc[id]->init_add_free(q.get_start(), q.get_len()); - } + 
alloc[id]->init_add_free( + block_reserved[id], + _get_total(id) - block_reserved[id]); } } } @@ -653,19 +588,19 @@ void BlueFS::_stop_alloc() } for (size_t i = 0; i < alloc.size(); ++i) { - if (alloc[i] && alloc[i] != shared_bdev_alloc) { + if (alloc[i] && !is_shared_alloc(i)) { alloc[i]->shutdown(); delete alloc[i]; alloc[i] = nullptr; } } - block_unused_too_granular.clear(); } int BlueFS::mount() { dout(1) << __func__ << dendl; + bool shared_alloc_ready = shared_alloc && shared_alloc->a; int r = _open_super(); if (r < 0) { derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl; @@ -681,8 +616,6 @@ int BlueFS::mount() get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100)); } - block_all.clear(); - block_all.resize(MAX_BDEV); _init_alloc(); _init_logger(); @@ -697,18 +630,25 @@ int BlueFS::mount() for (auto& p : file_map) { dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl; for (auto& q : p.second->fnode.extents) { - if (alloc[q.bdev] == shared_bdev_alloc) { - if (need_shared_alloc_init) { + if (is_shared_alloc(q.bdev)) { + // we might have still uninitialized shared_alloc at this point + // just bypass initialization then + if (shared_alloc_ready && shared_alloc->need_init) { + ceph_assert(shared_alloc->a); alloc[q.bdev]->init_rm_free(q.offset, q.length); - shared_bdev_used += q.length; + shared_alloc->bluefs_used += q.length; } } else { alloc[q.bdev]->init_rm_free(q.offset, q.length); } } } - need_shared_alloc_init = false; - dout(1) << __func__ << " shared_bdev_used = " << shared_bdev_used << dendl; + if (shared_alloc_ready) { + shared_alloc->need_init = false; + } + dout(1) << __func__ << " shared_bdev_used = " + << (shared_alloc_ready ? 
(int64_t)shared_alloc->bluefs_used : -1) + << dendl; // set up the log for future writes log_writer = _create_writer(_get_file(1)); @@ -872,7 +812,6 @@ int BlueFS::_open_super() int BlueFS::_check_new_allocations(const bluefs_fnode_t& fnode, size_t dev_count, - boost::dynamic_bitset* owned_blocks, boost::dynamic_bitset* used_blocks) { auto& fnode_extents = fnode.extents; @@ -880,21 +819,6 @@ int BlueFS::_check_new_allocations(const bluefs_fnode_t& fnode, auto id = e.bdev; bool fail = false; ceph_assert(id < dev_count); - apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id], - [&](uint64_t pos, boost::dynamic_bitset &bs) { - if (!bs.test(pos)) { - fail = true; - } - } - ); - if (fail) { - derr << __func__ << " invalid extent " << int(id) - << ": 0x" << std::hex << e.offset << "~" << e.length - << std::dec - << ": wasn't given but allocated for ino " << fnode.ino - << dendl; - return -EFAULT; - } apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id], [&](uint64_t pos, boost::dynamic_bitset &bs) { @@ -915,51 +839,6 @@ int BlueFS::_check_new_allocations(const bluefs_fnode_t& fnode, return 0; } -int BlueFS::_adjust_granularity( - __u8 id, uint64_t *offset, uint64_t *length, bool alloc) -{ - const char *op = alloc ? 
"op_alloc_add" : "op_alloc_rm"; - auto oldo = *offset; - auto oldl = *length; - if (*offset & (alloc_size[id] - 1)) { - *offset &= ~(alloc_size[id] - 1); - *offset += alloc_size[id]; - if (*length > *offset - oldo) { - if (alloc) { - block_unused_too_granular[id].insert(oldo, *offset - oldo); - } else { - block_unused_too_granular[id].erase(oldo, *offset - oldo); - } - *length -= (*offset - oldo); - } else { - if (alloc) { - block_unused_too_granular[id].insert(oldo, *length); - } else { - block_unused_too_granular[id].erase(oldo, *length); - } - *length = 0; - } - } - if (*length & (alloc_size[id] - 1)) { - *length &= ~(alloc_size[id] - 1); - if (alloc) { - block_unused_too_granular[id].insert( - *offset + *length, - oldo + oldl - *offset - *length); - } else { - block_unused_too_granular[id].erase( - *offset + *length, - oldo + oldl - *offset - *length); - } - } - if (oldo != *offset || oldl != *length) { - dout(10) << __func__ << " " << op << " " - << (int)id << ":" << std::hex << oldo << "~" << oldl - << " -> " << (int)id << ":" << *offset << "~" << *length << dendl; - } - return 0; -} - int BlueFS::_verify_alloc_granularity( __u8 id, uint64_t offset, uint64_t length, const char *op) { @@ -1000,11 +879,6 @@ int BlueFS::_replay(bool noop, bool to_stdout) FileRef log_file; log_file = _get_file(1); - // sanity check - for (auto& a : block_unused_too_granular) { - ceph_assert(a.empty()); - } - log_file->fnode = super.log_fnode; if (!noop) { log_file->vselector_hint = @@ -1028,14 +902,12 @@ int BlueFS::_replay(bool noop, bool to_stdout) bool seen_recs = false; boost::dynamic_bitset used_blocks[MAX_BDEV]; - boost::dynamic_bitset owned_blocks[MAX_BDEV]; if (!noop) { if (cct->_conf->bluefs_log_replay_check_allocations) { for (size_t i = 0; i < MAX_BDEV; ++i) { if (alloc_size[i] != 0 && bdev[i] != nullptr) { used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]); - owned_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / 
alloc_size[i]); } } } @@ -1211,123 +1083,26 @@ int BlueFS::_replay(bool noop, bool to_stdout) break; case bluefs_transaction_t::OP_ALLOC_ADD: + // LEGACY, do nothing but read params { - __u8 id; - uint64_t offset, length; - decode(id, p); - decode(offset, p); - decode(length, p); - dout(20) << __func__ << " 0x" << std::hex << pos << std::dec - << ": op_alloc_add " << " " << (int)id - << ":0x" << std::hex << offset << "~" << length << std::dec - << dendl; - if (unlikely(to_stdout)) { - std::cout << " 0x" << std::hex << pos << std::dec - << ": op_alloc_add " << " " << (int)id - << ":0x" << std::hex << offset << "~" << length << std::dec - << std::endl; - } - if (!noop) { - block_all[id].insert(offset, length); - _adjust_granularity(id, &offset, &length, true); - if (length && - alloc[id] != shared_bdev_alloc) { - alloc[id]->init_add_free(offset, length); - } - - if (cct->_conf->bluefs_log_replay_check_allocations) { - bool fail = false; - apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id], - [&](uint64_t pos, boost::dynamic_bitset &bs) { - if (bs.test(pos)) { - fail = true; - } else { - bs.set(pos); - } - } - ); - if (fail) { - derr << __func__ << " invalid extent " << (int)id - << ": 0x" << std::hex << offset << "~" << length - << std::dec << ": already given" << dendl; - return -EFAULT; - } - apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id], - [&](uint64_t pos, boost::dynamic_bitset &bs) { - if (bs.test(pos)) { - fail = true; - } - } - ); - if (fail) { - derr << __func__ << " invalid extent " << int(id) - << ": 0x" << std::hex << offset << "~" << length - << std::dec << ": already in use" << dendl; - return -EFAULT; - } - } - } - } + __u8 id; + uint64_t offset, length; + decode(id, p); + decode(offset, p); + decode(length, p); + } break; case bluefs_transaction_t::OP_ALLOC_RM: + // LEGACY, do nothing but read params { - __u8 id; - uint64_t offset, length; - decode(id, p); - decode(offset, p); - decode(length, p); - 
dout(20) << __func__ << " 0x" << std::hex << pos << std::dec - << ": op_alloc_rm " << " " << (int)id - << ":0x" << std::hex << offset << "~" << length << std::dec - << dendl; - if (unlikely(to_stdout)) { - std::cout << " 0x" << std::hex << pos << std::dec - << ": op_alloc_rm " << " " << (int)id - << ":0x" << std::hex << offset << "~" << length << std::dec - << std::endl; - } - if (!noop) { - block_all[id].erase(offset, length); - _adjust_granularity(id, &offset, &length, false); - if (length && alloc[id] != shared_bdev_alloc) { - alloc[id]->init_rm_free(offset, length); - } - if (cct->_conf->bluefs_log_replay_check_allocations) { - bool fail = false; - apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id], - [&](uint64_t pos, boost::dynamic_bitset &bs) { - if (!bs.test(pos)) { - fail = true; - } else { - bs.reset(pos); - } - } - ); - if (fail) { - derr << __func__ << " invalid extent " << int(id) - << ": 0x" << std::hex << offset << "~" << length - << std::dec << ": wasn't given" << dendl; - return -EFAULT; - } - - apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id], - [&](uint64_t pos, boost::dynamic_bitset &bs) { - if (bs.test(pos)) { - fail = true; - } - } - ); - if (fail) { - derr << __func__ << " invalid extent " << (int)id - << ": 0x" << std::hex << offset << "~" << length - << std::dec << ": still in use" << dendl; - return -EFAULT; - } - } - } - } - break; + __u8 id; + uint64_t offset, length; + decode(id, p); + decode(offset, p); + decode(length, p); + } + break; case bluefs_transaction_t::OP_DIR_LINK: { @@ -1448,7 +1223,7 @@ int BlueFS::_replay(bool noop, bool to_stdout) if (first_log_check) { first_log_check = false; int r = _check_new_allocations(log_file->fnode, - MAX_BDEV, owned_blocks, used_blocks); + MAX_BDEV, used_blocks); if (r < 0) { return r; } @@ -1484,7 +1259,7 @@ int BlueFS::_replay(bool noop, bool to_stdout) } if (cct->_conf->bluefs_log_replay_check_allocations) { int r = _check_new_allocations(f->fnode, 
- MAX_BDEV, owned_blocks, used_blocks); + MAX_BDEV, used_blocks); if (r < 0) { return r; } @@ -1513,21 +1288,6 @@ int BlueFS::_replay(bool noop, bool to_stdout) for (auto e : fnode_extents) { auto id = e.bdev; bool fail = false; - apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id], - [&](uint64_t pos, boost::dynamic_bitset &bs) { - if (!bs.test(pos)) { - fail = true; - } - } - ); - if (fail) { - derr << __func__ << " invalid extent " << int(id) - << ": 0x" << std::hex << e.offset << "~" << e.length - << std::dec - << ": wasn't given but is allocated for removed ino " << ino - << dendl; - return -EFAULT; - } apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id], [&](uint64_t pos, boost::dynamic_bitset &bs) { @@ -1571,7 +1331,7 @@ int BlueFS::_replay(bool noop, bool to_stdout) if (!noop && first_log_check && cct->_conf->bluefs_log_replay_check_allocations) { int r = _check_new_allocations(log_file->fnode, - MAX_BDEV, owned_blocks, used_blocks); + MAX_BDEV, used_blocks); if (r < 0) { return r; } @@ -1598,10 +1358,6 @@ int BlueFS::_replay(bool noop, bool to_stdout) } } - for (unsigned id = 0; id < block_unused_too_granular.size(); ++id) { - dout(10) << __func__ << " block_unused_too_granular " << id << ": " - << block_unused_too_granular[id] << dendl; - } dout(10) << __func__ << " done" << dendl; return 0; } @@ -1713,8 +1469,8 @@ int BlueFS::device_migrate_to_existing( PExtentVector to_release; to_release.emplace_back(old_ext.offset, old_ext.length); alloc[old_ext.bdev]->release(to_release); - if (alloc[old_ext.bdev] == shared_bdev_alloc) { - shared_bdev_used -= to_release.size(); + if (is_shared_alloc(old_ext.bdev)) { + shared_alloc->bluefs_used -= to_release.size(); } } @@ -1854,8 +1610,8 @@ int BlueFS::device_migrate_to_new( PExtentVector to_release; to_release.emplace_back(old_ext.offset, old_ext.length); alloc[old_ext.bdev]->release(to_release); - if (alloc[old_ext.bdev] == shared_bdev_alloc) { - shared_bdev_used -= 
to_release.size(); + if (is_shared_alloc(old_ext.bdev)) { + shared_alloc->bluefs_used -= to_release.size(); } } @@ -2172,8 +1928,6 @@ uint64_t BlueFS::_estimate_log_size() int avg_file_size = 12; uint64_t size = 4096 * 2; size += file_map.size() * (1 + sizeof(bluefs_fnode_t)); - for (auto& p : block_all) - size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2); size += dir_map.size() + (1 + avg_dir_size); size += file_map.size() * (1 + avg_dir_size + avg_file_size); return round_up_to(size, super.block_size); @@ -2217,38 +1971,6 @@ void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t, dout(20) << __func__ << " op_init" << dendl; t->op_init(); - for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) { - interval_set& p = block_all[bdev]; - for (interval_set::iterator q = p.begin(); q != p.end(); ++q) { - auto bdev_new = bdev; - if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) { - continue; - } - if ((flags & REMOVE_DB) && bdev == BDEV_DB) { - continue; - } - if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) { - bdev_new = BDEV_DB; - } - if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) { - bdev_new = BDEV_SLOW; - } - if (bdev == BDEV_NEWDB) { - // REMOVE_DB xor RENAME_DB - ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW)); - ceph_assert(!(flags & RENAME_SLOW2DB)); - bdev_new = BDEV_DB; - } - if (bdev == BDEV_NEWWAL) { - ceph_assert(flags & REMOVE_WAL); - bdev_new = BDEV_WAL; - } - dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x" - << std::hex << q.get_start() << "~" << q.get_len() << std::dec - << dendl; - t->op_alloc_add(bdev_new, q.get_start(), q.get_len()); - } - } for (auto& [ino, file_ref] : file_map) { if (ino == 1) continue; @@ -2733,8 +2455,8 @@ int BlueFS::_flush_and_sync_log(std::unique_lock& l, } } alloc[i]->release(to_release[i]); - if (alloc[i] == shared_bdev_alloc) { - shared_bdev_used -= to_release[i].size(); + if (is_shared_alloc(i)) { + shared_alloc->bluefs_used -= to_release[i].size(); } } } @@ -3105,7 +2827,7 @@ 
void BlueFS::flush_bdev() for (unsigned i = 0; i < MAX_BDEV; i++) { // alloc space from BDEV_SLOW is unexpected. // So most cases we don't alloc from BDEV_SLOW and so avoiding flush not-used device. - if (bdev[i] && ((i != BDEV_SLOW) || (block_all[i].size() - alloc[i]->get_free()))) { + if (bdev[i] && (i != BDEV_SLOW || _get_used(i))) { bdev[i]->flush(); } } @@ -3146,8 +2868,8 @@ int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len, alloc[id]->dump(); return -ENOSPC; } - if (alloc[id] == shared_bdev_alloc) { - shared_bdev_used += alloc_len; + if (is_shared_alloc(id)) { + shared_alloc->bluefs_used += alloc_len; } return 0; @@ -3192,14 +2914,13 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, << std::dec << dendl; return -ENOSPC; } else { - uint64_t total_allocated = - block_all[id].size() - alloc[id]->get_free(); - if (max_bytes[id] < total_allocated) { - logger->set(max_bytes_pcounters[id], total_allocated); - max_bytes[id] = total_allocated; + uint64_t used = _get_used(id); + if (max_bytes[id] < used) { + logger->set(max_bytes_pcounters[id], used); + max_bytes[id] = used; } - if (alloc[id] == shared_bdev_alloc) { - shared_bdev_used += alloc_len; + if (is_shared_alloc(id)) { + shared_alloc->bluefs_used += alloc_len; } } @@ -3834,16 +3555,6 @@ int BlueFS::do_replay_recovery_read(FileReader *log_reader, return 0; } -void BlueFS::debug_inject_duplicate_gift(unsigned id, - uint64_t offset, - uint64_t len) -{ - dout(0) << __func__ << dendl; - if (id < alloc.size() && alloc[id]) { - alloc[id]->init_add_free(offset, len); - } -} - // =============================================== // OriginalVolumeSelector diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index c8e2063ce79a..b14b94b911d7 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -70,7 +70,22 @@ public: virtual void get_paths(const std::string& base, paths& res) const = 0; virtual void dump(std::ostream& sout) = 0; }; -class BlueFS; + +struct 
bluefs_shared_alloc_context_t { + bool need_init = false; + Allocator* a = nullptr; + + std::atomic bluefs_used = 0; + + void set(Allocator* _a) { + a = _a; + need_init = true; + bluefs_used = 0; + } + void reset() { + a = nullptr; + } +}; class BlueFS { public: @@ -300,19 +315,20 @@ private: */ std::vector bdev; ///< block devices we can use std::vector ioc; ///< IOContexts for bdevs - std::vector > block_all; ///< extents in bdev we own + std::vector block_reserved; ///< starting reserve extent per device std::vector alloc; ///< allocators for bdevs std::vector alloc_size; ///< alloc size for each device std::vector> pending_release; ///< extents to release - std::vector> block_unused_too_granular; + //std::vector> block_unused_too_granular; BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev std::unique_ptr vselector; - bool need_shared_alloc_init = false; - Allocator* shared_bdev_alloc = nullptr; - std::atomic shared_bdev_used = 0; - + bluefs_shared_alloc_context_t* shared_alloc = nullptr; + unsigned shared_alloc_id = unsigned(-1); + inline bool is_shared_alloc(unsigned id) const { + return id == shared_alloc_id; + } class SocketHook; SocketHook* asok_hook = nullptr; @@ -326,6 +342,10 @@ private: void _pad_bl(ceph::buffer::list& bl); ///< pad ceph::buffer::list to block size w/ zeros + uint64_t _get_used(unsigned id) const; + uint64_t _get_total(unsigned id) const; + + FileRef _get_file(uint64_t ino); void _drop_link(FileRef f); @@ -399,13 +419,10 @@ private: int _write_super(int dev); int _check_new_allocations(const bluefs_fnode_t& fnode, size_t dev_count, - boost::dynamic_bitset* owned_blocks, boost::dynamic_bitset* used_blocks); int _verify_alloc_granularity( __u8 id, uint64_t offset, uint64_t length, const char *op); - int _adjust_granularity( - __u8 id, uint64_t *offset, uint64_t *length, bool alloc); int _replay(bool noop, bool to_stdout = false); ///< replay journal FileWriter *_create_writer(FileRef f); @@ -420,11 +437,8 @@ 
private: return 4096; } - void _add_block_extent(bool create, unsigned bdev, uint64_t offset, - uint64_t len, bool skip=false); - public: - BlueFS(CephContext* cct); + BlueFS(CephContext* cct, bluefs_shared_alloc_context_t* _shared_alloc); ~BlueFS(); // the super is always stored on bdev 0 @@ -458,7 +472,6 @@ public: uint64_t get_total(unsigned id); uint64_t get_free(unsigned id); uint64_t get_used(unsigned id); - void get_usage(std::vector> *usage); // [ ...] void dump_perf_counters(ceph::Formatter *f); void dump_block_extents(std::ostream& out); @@ -519,19 +532,10 @@ public: } int add_block_device(unsigned bdev, const std::string& path, bool trim, - bool shared_with_bluestore = false, - Allocator* shared_bdev_alloc = nullptr); + uint64_t reserved, + bool shared_with_bluestore = false); bool bdev_support_label(unsigned id); - uint64_t get_block_device_size(unsigned bdev); - - /// gift more block space - void add_block_extent(bool create, unsigned bdev, uint64_t offset, uint64_t len, - bool skip=false) { - std::unique_lock l(lock); - _add_block_extent(create, bdev, offset, len, skip); - int r = _flush_and_sync_log(l); - ceph_assert(r == 0); - } + uint64_t get_block_device_size(unsigned bdev) const; // handler for discard event void handle_discard(unsigned dev, interval_set& to_release); @@ -590,7 +594,6 @@ public: bufferlist* bl); /// test purpose methods - void debug_inject_duplicate_gift(unsigned bdev, uint64_t offset, uint64_t len); const PerfCounters* get_perf_counters() const { return logger; } diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 331559da0bab..f66da6a5556f 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4147,8 +4147,8 @@ static void discard_cb(void *priv, void *priv2) void BlueStore::handle_discard(interval_set& to_release) { dout(10) << __func__ << dendl; - ceph_assert(alloc); - alloc->release(to_release); + ceph_assert(shared_alloc.a); + shared_alloc.a->release(to_release); } 
BlueStore::BlueStore(CephContext *cct, const string& path) @@ -4916,7 +4916,6 @@ int BlueStore::_open_bdev(bool create) void BlueStore::_validate_bdev() { ceph_assert(bdev); - ceph_assert(min_alloc_size); // _get_odisk_reserved depends on that uint64_t dev_size = bdev->get_size(); ceph_assert(dev_size > _get_ondisk_reserved()); } @@ -5058,7 +5057,7 @@ int BlueStore::_write_out_fm_meta(uint64_t target_size) int BlueStore::_open_alloc() { - ceph_assert(alloc == NULL); + ceph_assert(shared_alloc.a == NULL); ceph_assert(bdev->get_size()); uint64_t alloc_size = min_alloc_size; @@ -5069,11 +5068,11 @@ int BlueStore::_open_alloc() alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size); } - alloc = Allocator::create(cct, cct->_conf->bluestore_allocator, + shared_alloc.set(Allocator::create(cct, cct->_conf->bluestore_allocator, bdev->get_size(), - alloc_size, "block"); + alloc_size, "block")); - if (!alloc) { + if (!shared_alloc.a) { lderr(cct) << __func__ << " Allocator::unknown alloc type " << cct->_conf->bluestore_allocator << dendl; @@ -5081,7 +5080,7 @@ int BlueStore::_open_alloc() } if (bdev->is_smr()) { - alloc->set_zone_states(fm->get_zone_states(db)); + shared_alloc.a->set_zone_states(fm->get_zone_states(db)); } uint64_t num = 0, bytes = 0; @@ -5091,7 +5090,7 @@ int BlueStore::_open_alloc() fm->enumerate_reset(); uint64_t offset, length; while (fm->enumerate_next(db, &offset, &length)) { - alloc->init_add_free(offset, length); + shared_alloc.a->init_add_free(offset, length); ++num; bytes += length; } @@ -5099,7 +5098,7 @@ int BlueStore::_open_alloc() dout(1) << __func__ << " loaded " << byte_u_t(bytes) << " in " << num << " extents" - << " available " << byte_u_t(alloc->get_free()) + << " available " << byte_u_t(shared_alloc.a->get_free()) << dendl; return 0; @@ -5110,10 +5109,10 @@ void BlueStore::_close_alloc() ceph_assert(bdev); bdev->discard_drain(); - ceph_assert(alloc); - alloc->shutdown(); - delete alloc; - alloc = NULL; + 
ceph_assert(shared_alloc.a); + shared_alloc.a->shutdown(); + delete shared_alloc.a; + shared_alloc.reset(); } int BlueStore::_open_fsid(bool create) @@ -5275,7 +5274,7 @@ bool BlueStore::test_mount_in_use() int BlueStore::_minimal_open_bluefs(bool create) { int r; - bluefs = new BlueFS(cct); + bluefs = new BlueFS(cct, &shared_alloc); string bfn; struct stat st; @@ -5284,7 +5283,8 @@ int BlueStore::_minimal_open_bluefs(bool create) if (::stat(bfn.c_str(), &st) == 0) { r = bluefs->add_block_device( BlueFS::BDEV_DB, bfn, - create && cct->_conf->bdev_enable_discard); + create && cct->_conf->bdev_enable_discard, + SUPER_RESERVED); if (r < 0) { derr << __func__ << " add block device(" << bfn << ") returned: " << cpp_strerror(r) << dendl; @@ -5303,13 +5303,6 @@ int BlueStore::_minimal_open_bluefs(bool create) goto free_bluefs; } } - if (create) { - bluefs->add_block_extent( - create, - BlueFS::BDEV_DB, - SUPER_RESERVED, - bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED); - } bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW; bluefs_layout.dedicated_db = true; } else { @@ -5328,27 +5321,19 @@ int BlueStore::_minimal_open_bluefs(bool create) bfn = path + "/block"; // never trim here r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false, - true, - alloc); + 0, // no need to provide valid 'reserved' for shared dev + true); if (r < 0) { derr << __func__ << " add block device(" << bfn << ") returned: " << cpp_strerror(r) << dendl; goto free_bluefs; } - if (create) { - auto reserved = _get_ondisk_reserved(); - - bluefs->add_block_extent( - create, - bluefs_layout.shared_bdev, - reserved, - p2align(bdev->get_size(), min_alloc_size) - reserved); - } bfn = path + "/block.wal"; if (::stat(bfn.c_str(), &st) == 0) { r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn, - create && cct->_conf->bdev_enable_discard); + create && cct->_conf->bdev_enable_discard, + BDEV_LABEL_BLOCK_SIZE); if (r < 0) { derr << __func__ << " add block device(" << bfn << ") returned: " 
<< cpp_strerror(r) << dendl; @@ -5367,13 +5352,6 @@ int BlueStore::_minimal_open_bluefs(bool create) } } - if (create) { - bluefs->add_block_extent( - create, - BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE, - bluefs->get_block_device_size(BlueFS::BDEV_WAL) - - BDEV_LABEL_BLOCK_SIZE); - } bluefs_layout.dedicated_wal = true; } else { r = 0; @@ -5393,7 +5371,7 @@ free_bluefs: return r; } -int BlueStore::_open_bluefs(bool create) +int BlueStore::_open_bluefs(bool create, bool read_only) { int r = _minimal_open_bluefs(create); if (r < 0) { @@ -5487,6 +5465,7 @@ int BlueStore::_open_db_and_around(bool read_only) if (do_bluefs) { // open in read-only first to read FM list and init allocator // as they might be needed for some BlueFS procedures + r = _open_db(false, false, true); if (r < 0) return r; @@ -5662,7 +5641,7 @@ int BlueStore::_prepare_db_environment(bool create, bool read_only, return -EINVAL; } - r = _open_bluefs(create); + r = _open_bluefs(create, read_only); if (r < 0) { return r; } @@ -5833,7 +5812,7 @@ void BlueStore::_dump_alloc_on_failure() cct->_conf->bluestore_bluefs_alloc_failure_dump_interval; if (dump_interval > 0 && next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) { - alloc->dump(); + shared_alloc.a->dump(); next_dump_on_bluefs_alloc_failure = ceph_clock_now(); next_dump_on_bluefs_alloc_failure += dump_interval; } @@ -6180,15 +6159,15 @@ int BlueStore::mkfs() return r; alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size); } - alloc = Allocator::create(cct, cct->_conf->bluestore_allocator, + shared_alloc.set(Allocator::create(cct, cct->_conf->bluestore_allocator, bdev->get_size(), - alloc_size, "block"); - if (!alloc) { + alloc_size, "block")); + if (!shared_alloc.a) { r = -EINVAL; goto out_close_bdev; } reserved = _get_ondisk_reserved(); - alloc->init_add_free(reserved, + shared_alloc.a->init_add_free(reserved, p2align(bdev->get_size(), min_alloc_size) - reserved); r = _open_db(true); @@ -6243,8 +6222,8 @@ int BlueStore::mkfs() 
out_close_db: _close_db(false); out_close_bdev: - delete alloc; - alloc = nullptr; + delete shared_alloc.a; + shared_alloc.reset(); _close_bdev(); out_close_fsid: _close_fsid(); @@ -6318,7 +6297,6 @@ int BlueStore::add_new_bluefs_device(int id, const string& dev_path) r = _mount_for_bluefs(); - int reserved = 0; if (id == BlueFS::BDEV_NEWWAL) { string p = path + "/block.wal"; r = _setup_block_symlink_or_file("block.wal", dev_path, @@ -6327,7 +6305,8 @@ int BlueStore::add_new_bluefs_device(int id, const string& dev_path) ceph_assert(r == 0); r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p, - cct->_conf->bdev_enable_discard); + cct->_conf->bdev_enable_discard, + BDEV_LABEL_BLOCK_SIZE); ceph_assert(r == 0); if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) { @@ -6339,7 +6318,6 @@ int BlueStore::add_new_bluefs_device(int id, const string& dev_path) ceph_assert(r == 0); } - reserved = BDEV_LABEL_BLOCK_SIZE; bluefs_layout.dedicated_wal = true; } else if (id == BlueFS::BDEV_NEWDB) { string p = path + "/block.db"; @@ -6349,7 +6327,8 @@ int BlueStore::add_new_bluefs_device(int id, const string& dev_path) ceph_assert(r == 0); r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p, - cct->_conf->bdev_enable_discard); + cct->_conf->bdev_enable_discard, + SUPER_RESERVED); ceph_assert(r == 0); if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) { @@ -6360,7 +6339,6 @@ int BlueStore::add_new_bluefs_device(int id, const string& dev_path) true); ceph_assert(r == 0); } - reserved = SUPER_RESERVED; bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW; bluefs_layout.dedicated_db = true; } @@ -6368,12 +6346,6 @@ int BlueStore::add_new_bluefs_device(int id, const string& dev_path) bluefs->umount(); bluefs->mount(); - bluefs->add_block_extent( - false, - id, - reserved, - bluefs->get_block_device_size(id) - reserved, true); - r = bluefs->prepare_new_device(id, bluefs_layout); ceph_assert(r == 0); @@ -6459,7 +6431,6 @@ int BlueStore::migrate_to_new_bluefs_device(const set& devs_source, r = 
_mount_for_bluefs(); - int reserved = 0; string link_db; string link_wal; if (devs_source.count(BlueFS::BDEV_DB) && @@ -6481,7 +6452,8 @@ int BlueStore::migrate_to_new_bluefs_device(const set& devs_source, bluefs_layout.dedicated_wal = true; r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path, - cct->_conf->bdev_enable_discard); + cct->_conf->bdev_enable_discard, + BDEV_LABEL_BLOCK_SIZE); ceph_assert(r == 0); if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) { @@ -6492,7 +6464,6 @@ int BlueStore::migrate_to_new_bluefs_device(const set& devs_source, true); ceph_assert(r == 0); } - reserved = BDEV_LABEL_BLOCK_SIZE; } else if (id == BlueFS::BDEV_NEWDB) { target_name = "block.db"; target_size = cct->_conf->bluestore_block_db_size; @@ -6500,7 +6471,8 @@ int BlueStore::migrate_to_new_bluefs_device(const set& devs_source, bluefs_layout.dedicated_db = true; r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path, - cct->_conf->bdev_enable_discard); + cct->_conf->bdev_enable_discard, + SUPER_RESERVED); ceph_assert(r == 0); if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) { @@ -6511,16 +6483,11 @@ int BlueStore::migrate_to_new_bluefs_device(const set& devs_source, true); ceph_assert(r == 0); } - reserved = SUPER_RESERVED; } bluefs->umount(); bluefs->mount(); - bluefs->add_block_extent( - false, - id, reserved, bluefs->get_block_device_size(id) - reserved); - r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout); if (r < 0) { @@ -6606,28 +6573,20 @@ int BlueStore::expand_devices(ostream& out) continue; } - interval_set before; - bluefs->get_block_extents(devid, &before); - ceph_assert(!before.empty()); - uint64_t end = before.range_end(); - if (end < size) { - out << devid - <<" : expanding " << " from 0x" << std::hex - << end << " to 0x" << size << std::dec << std::endl; - bluefs->add_block_extent(false, devid, end, size-end); - string p = get_device_path(devid); - const char* path = p.c_str(); - if (path == nullptr) { - derr << devid - <<": 
can't find device path " << dendl; - continue; - } - if (bluefs->bdev_support_label(devid)) { - if (_set_bdev_label_size(p, size) >= 0) { - out << devid - << " : size label updated to " << size - << std::endl; - } + out << devid + <<" : expanding " << " to 0x" << size << std::dec << std::endl; + string p = get_device_path(devid); + const char* path = p.c_str(); + if (path == nullptr) { + derr << devid + <<": can't find device path " << dendl; + continue; + } + if (bluefs->bdev_support_label(devid)) { + if (_set_bdev_label_size(p, size) >= 0) { + out << devid + << " : size label updated to " << size + << std::endl; } } } @@ -6637,8 +6596,6 @@ int BlueStore::expand_devices(ostream& out) out << bluefs_layout.shared_bdev << " : expanding " << " from 0x" << std::hex << size0 << " to 0x" << size << std::dec << std::endl; - bluefs->add_block_extent(false, - bluefs_layout.shared_bdev, size0, size - size0); _write_out_fm_meta(size); if (bdev->supported_bdev_label()) { if (_set_bdev_label_size(path, size) >= 0) { @@ -8282,17 +8239,18 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) continue; } PExtentVector exts; - int64_t alloc_len = alloc->allocate(e->length, min_alloc_size, - 0, 0, &exts); + int64_t alloc_len = + shared_alloc.a->allocate(e->length, min_alloc_size, + 0, 0, &exts); if (alloc_len < 0 || alloc_len < (int64_t)e->length) { derr << __func__ << " failed to allocate 0x" << std::hex << e->length << " allocated 0x " << (alloc_len < 0 ? 
0 : alloc_len) << " min_alloc_size 0x" << min_alloc_size - << " available 0x " << alloc->get_free() + << " available 0x " << shared_alloc.a->get_free() << std::dec << dendl; if (alloc_len > 0) { - alloc->release(exts); + shared_alloc.a->release(exts); } bypass_rest = true; break; @@ -8372,7 +8330,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) << "~" << it.get_len() << std::dec << dendl; fm->release(it.get_start(), it.get_len(), txn); } - alloc->release(to_release); + shared_alloc.a->release(to_release); to_release.clear(); } // if (it) { } //if (repair && repairer.preprocess_misreference()) { @@ -8634,7 +8592,7 @@ void BlueStore::inject_leaked(uint64_t len) txn = db->get_transaction(); PExtentVector exts; - int64_t alloc_len = alloc->allocate(len, min_alloc_size, + int64_t alloc_len = shared_alloc.a->allocate(len, min_alloc_size, min_alloc_size * 256, 0, &exts); ceph_assert(alloc_len >= (int64_t)len); for (auto& p : exts) { @@ -8912,7 +8870,7 @@ void BlueStore::_get_statfs_overall(struct store_statfs_t *buf) db->estimate_prefix_size(PREFIX_OMAP, string()) + db->estimate_prefix_size(PREFIX_PERPOOL_OMAP, string()); - uint64_t bfree = alloc->get_free(); + uint64_t bfree = shared_alloc.a->get_free(); if (bluefs) { buf->internally_reserved = 0; @@ -10607,6 +10565,7 @@ ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator( // write helpers uint64_t BlueStore::_get_ondisk_reserved() const { + ceph_assert(min_alloc_size); return round_up_to( std::max(SUPER_RESERVED, min_alloc_size), min_alloc_size); } @@ -11416,7 +11375,7 @@ void BlueStore::_txc_release_alloc(TransContext *txc) } dout(10) << __func__ << "(sync) " << txc << " " << std::hex << txc->released << std::dec << dendl; - alloc->release(txc->released); + shared_alloc.a->release(txc->released); } out: @@ -11923,7 +11882,7 @@ void BlueStore::_kv_finalize_thread() _reap_collections(); logger->set(l_bluestore_fragmentation, - (uint64_t)(alloc->get_fragmentation() * 1000)); + 
(uint64_t)(shared_alloc.a->get_fragmentation() * 1000)); log_latency("kv_final", l_bluestore_kv_final_lat, @@ -13579,17 +13538,17 @@ int BlueStore::_do_alloc_write( PExtentVector prealloc; prealloc.reserve(2 * wctx->writes.size());; int64_t prealloc_left = 0; - prealloc_left = alloc->allocate( + prealloc_left = shared_alloc.a->allocate( need, min_alloc_size, need, 0, &prealloc); if (prealloc_left < 0 || prealloc_left < (int64_t)need) { derr << __func__ << " failed to allocate 0x" << std::hex << need << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left) << " min_alloc_size 0x" << min_alloc_size - << " available 0x " << alloc->get_free() + << " available 0x " << shared_alloc.a->get_free() << std::dec << dendl; if (prealloc.size()) { - alloc->release(prealloc); + shared_alloc.a->release(prealloc); } return -ENOSPC; } diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index edd6c015fbde..2f0f80160fd5 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -2051,7 +2051,9 @@ private: BlockDevice *bdev = nullptr; std::string freelist_type; FreelistManager *fm = nullptr; - Allocator *alloc = nullptr; + + bluefs_shared_alloc_context_t shared_alloc; + uuid_d fsid; int path_fd = -1; ///< open handle to $path int fsid_fd = -1; ///< open handle (locked) to $path/fsid @@ -2353,7 +2355,7 @@ private: int _minimal_open_bluefs(bool create); void _minimal_close_bluefs(); - int _open_bluefs(bool create); + int _open_bluefs(bool create, bool read_only); void _close_bluefs(bool cold_close); // Limited (u)mount intended for BlueFS operations only diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc index 8b1a1d5cb397..4a2a8152c2d7 100644 --- a/src/os/bluestore/bluefs_types.cc +++ b/src/os/bluestore/bluefs_types.cc @@ -228,8 +228,6 @@ void bluefs_transaction_t::generate_test_instances( ls.push_back(new bluefs_transaction_t); ls.push_back(new bluefs_transaction_t); ls.back()->op_init(); - 
ls.back()->op_alloc_add(0, 0, 123123211); - ls.back()->op_alloc_rm(1, 0, 123); ls.back()->op_dir_create("dir"); ls.back()->op_dir_create("dir2"); bluefs_fnode_t fnode; diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h index 42bc6ebae6ed..eea4845349e8 100644 --- a/src/os/bluestore/bluefs_types.h +++ b/src/os/bluestore/bluefs_types.h @@ -185,8 +185,8 @@ struct bluefs_transaction_t { typedef enum { OP_NONE = 0, OP_INIT, ///< initial (empty) file system marker - OP_ALLOC_ADD, ///< add extent to available block storage (extent) - OP_ALLOC_RM, ///< remove extent from available block storage (extent) + OP_ALLOC_ADD, ///< OBSOLETE: add extent to available block storage (extent) + OP_ALLOC_RM, ///< OBSOLETE: remove extent from available block storage (extent) OP_DIR_LINK, ///< (re)set a dir entry (dirname, filename, ino) OP_DIR_UNLINK, ///< remove a dir entry (dirname, filename) OP_DIR_CREATE, ///< create a dir (dirname) @@ -214,20 +214,6 @@ struct bluefs_transaction_t { using ceph::encode; encode((__u8)OP_INIT, op_bl); } - void op_alloc_add(uint8_t id, uint64_t offset, uint64_t length) { - using ceph::encode; - encode((__u8)OP_ALLOC_ADD, op_bl); - encode(id, op_bl); - encode(offset, op_bl); - encode(length, op_bl); - } - void op_alloc_rm(uint8_t id, uint64_t offset, uint64_t length) { - using ceph::encode; - encode((__u8)OP_ALLOC_RM, op_bl); - encode(id, op_bl); - encode(offset, op_bl); - encode(length, op_bl); - } void op_dir_create(const std::string& dir) { using ceph::encode; encode((__u8)OP_DIR_CREATE, op_bl); @@ -284,5 +270,4 @@ struct bluefs_transaction_t { WRITE_CLASS_ENCODER(bluefs_transaction_t) std::ostream& operator<<(std::ostream& out, const bluefs_transaction_t& t); - #endif diff --git a/src/os/bluestore/bluestore_tool.cc b/src/os/bluestore/bluestore_tool.cc index 326658e446a9..40900e38559f 100644 --- a/src/os/bluestore/bluestore_tool.cc +++ b/src/os/bluestore/bluestore_tool.cc @@ -161,7 +161,7 @@ void add_devices( cout << " -> " 
<< target_path; } cout << std::endl; - int r = fs->add_block_device(e.second, e.first, false); + int r = fs->add_block_device(e.second, e.first, false, 0); // 'reserved' is fake if (r < 0) { cerr << "unable to open " << e.first << ": " << cpp_strerror(r) << std::endl; exit(EXIT_FAILURE); @@ -169,13 +169,15 @@ void add_devices( } } -BlueFS *open_bluefs( +BlueFS *open_bluefs_readonly( CephContext *cct, const string& path, const vector& devs) { validate_path(cct, path, true); - BlueFS *fs = new BlueFS(cct); + // We provide no shared allocator which prevents bluefs to operate in R/W mode. + // Read-only mode isn't strictly enforced though + BlueFS *fs = new BlueFS(cct, nullptr); add_devices(fs, cct, devs); @@ -194,7 +196,9 @@ void log_dump( const vector& devs) { validate_path(cct, path, true); - BlueFS *fs = new BlueFS(cct); + // We provide no shared allocator which prevents bluefs to operate in R/W mode. + // Read-only mode isn't strictly enforced though + BlueFS *fs = new BlueFS(cct, nullptr); add_devices(fs, cct, devs); int r = fs->log_dump(); @@ -595,7 +599,7 @@ int main(int argc, char **argv) } } else if (action == "bluefs-export") { - BlueFS *fs = open_bluefs(cct.get(), path, devs); + BlueFS *fs = open_bluefs_readonly(cct.get(), path, devs); vector dirs; int r = fs->readdir("", &dirs); diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc index 427af756cd6b..add3f86c62c1 100644 --- a/src/test/objectstore/test_bluefs.cc +++ b/src/test/objectstore/test_bluefs.cc @@ -89,18 +89,16 @@ TEST(BlueFS, mkfs) { uint64_t size = 1048576 * 128; TempBdev bdev{size}; uuid_d fsid; - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); + BlueFS fs(g_ceph_context, nullptr); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); } TEST(BlueFS, 
mkfs_mount) { uint64_t size = 1048576 * 128; TempBdev bdev{size}; - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); + BlueFS fs(g_ceph_context, nullptr); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); uuid_d fsid; ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); @@ -110,74 +108,11 @@ TEST(BlueFS, mkfs_mount) { fs.umount(); } -TEST(BlueFS, mkfs_mount_duplicate_gift) { - uint64_t size = 1048576 * 128; - TempBdev bdev{ size }; - bluefs_extent_t dup_ext; - { - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); - uuid_d fsid; - ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); - ASSERT_EQ(0, fs.mount()); - - { - BlueFS::FileWriter *h; - ASSERT_EQ(0, fs.mkdir("dir")); - ASSERT_EQ(0, fs.open_for_write("dir", "file1", &h, false)); - h->append("foo", 3); - h->append("bar", 3); - h->append("baz", 3); - fs.fsync(h); - ceph_assert(h->file->fnode.extents.size() > 0); - dup_ext = h->file->fnode.extents[0]; - ceph_assert(dup_ext.bdev == BlueFS::BDEV_DB); - fs.close_writer(h); - } - - fs.umount(); - } - - { - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - ASSERT_EQ(0, fs.mount()); - // free allocation presumably allocated for file1 - std::cout << "duplicate extent: " << std::hex - << dup_ext.offset << "~" << dup_ext.length - << std::dec << std::endl; - fs.debug_inject_duplicate_gift(BlueFS::BDEV_DB, dup_ext.offset, dup_ext.length); - { - // overwrite file1 with file2 - BlueFS::FileWriter *h; - ASSERT_EQ(0, fs.open_for_write("dir", "file2", &h, false)); - h->append("foo", 3); - h->append("bar", 3); - h->append("baz", 3); - fs.fsync(h); - fs.close_writer(h); - } - fs.umount(); - } - - 
g_ceph_context->_conf.set_val_or_die("bluefs_log_replay_check_allocations", "true"); - g_ceph_context->_conf.apply_changes(nullptr); - - { - // this should fail - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - ASSERT_NE(0, fs.mount()); - } -} - TEST(BlueFS, write_read) { uint64_t size = 1048576 * 128; TempBdev bdev{size}; - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); + BlueFS fs(g_ceph_context, nullptr); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); uuid_d fsid; ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); @@ -206,9 +141,8 @@ TEST(BlueFS, write_read) { TEST(BlueFS, small_appends) { uint64_t size = 1048576 * 128; TempBdev bdev{size}; - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); + BlueFS fs(g_ceph_context, nullptr); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); uuid_d fsid; ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); @@ -239,14 +173,13 @@ TEST(BlueFS, very_large_write) { // we'll write a ~5G file, so allocate more than that for the whole fs uint64_t size = 1048576 * 1024 * 8ull; TempBdev bdev{size}; - BlueFS fs(g_ceph_context); + BlueFS fs(g_ceph_context, nullptr); bool old = g_ceph_context->_conf.get_val("bluefs_buffered_io"); g_ceph_context->_conf.set_val("bluefs_buffered_io", "false"); uint64_t total_written = 0; - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); uuid_d fsid; ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false 
})); ASSERT_EQ(0, fs.mount()); @@ -430,9 +363,8 @@ TEST(BlueFS, test_flush_1) { "65536"); g_ceph_context->_conf.apply_changes(nullptr); - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); + BlueFS fs(g_ceph_context, nullptr); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); uuid_d fsid; ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); @@ -465,9 +397,8 @@ TEST(BlueFS, test_flush_2) { "65536"); g_ceph_context->_conf.apply_changes(nullptr); - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); + BlueFS fs(g_ceph_context, nullptr); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); uuid_d fsid; ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); @@ -493,9 +424,8 @@ TEST(BlueFS, test_flush_3) { "65536"); g_ceph_context->_conf.apply_changes(nullptr); - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); + BlueFS fs(g_ceph_context, nullptr); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); uuid_d fsid; ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); @@ -527,9 +457,8 @@ TEST(BlueFS, test_simple_compaction_sync) { uint64_t size = 1048576 * 128; TempBdev bdev{size}; - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); + BlueFS fs(g_ceph_context, nullptr); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); uuid_d fsid; ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, 
false, false })); ASSERT_EQ(0, fs.mount()); @@ -580,9 +509,8 @@ TEST(BlueFS, test_simple_compaction_async) { uint64_t size = 1048576 * 128; TempBdev bdev{size}; - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); + BlueFS fs(g_ceph_context, nullptr); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); uuid_d fsid; ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); @@ -636,9 +564,8 @@ TEST(BlueFS, test_compaction_sync) { "bluefs_compact_log_sync", "true"); - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); + BlueFS fs(g_ceph_context, nullptr); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); uuid_d fsid; ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); @@ -674,9 +601,8 @@ TEST(BlueFS, test_compaction_async) { "bluefs_compact_log_sync", "false"); - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); + BlueFS fs(g_ceph_context, nullptr); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); uuid_d fsid; ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); @@ -712,9 +638,8 @@ TEST(BlueFS, test_replay) { "bluefs_compact_log_sync", "false"); - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); + BlueFS fs(g_ceph_context, nullptr); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); uuid_d fsid; ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, 
fs.mount()); @@ -758,9 +683,8 @@ TEST(BlueFS, test_replay_growth) { conf.SetVal("bluefs_sync_write", "true"); conf.ApplyChanges(); - BlueFS fs(g_ceph_context); - ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); - fs.add_block_extent(true, BlueFS::BDEV_DB, 1048576, size - 1048576); + BlueFS fs(g_ceph_context, nullptr); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false, 1048576)); uuid_d fsid; ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount());