#include "BitmapFreelistManager.h"
#include "kv/KeyValueDB.h"
#include "os/kv.h"
+#include "include/stringify.h"
#include "common/debug.h"
_init_misc();
- blocks = size / bytes_per_block;
- if (blocks / blocks_per_key * blocks_per_key != blocks) {
- blocks = (blocks / blocks_per_key + 1) * blocks_per_key;
+ blocks = size_2_block_count(size);
+ if (blocks * bytes_per_block > size) {
dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size
<< " to 0x" << (blocks * bytes_per_block)
<< " (0x" << blocks << " blocks)" << std::dec << dendl;
return 0;
}
-int BitmapFreelistManager::expand(uint64_t new_size, KeyValueDB::Transaction txn)
+// Grows the persisted freelist bitmap from old_size to the member `size`,
+// which must already hold the larger value (asserted below).
+// The function obtains its own transaction from `db`, records the bitmap and
+// "size" meta changes in it, and submits it synchronously before returning.
+// Returns 0 on the path visible here.
+int BitmapFreelistManager::_expand(uint64_t old_size, KeyValueDB* db)
{
- assert(new_size > size);
+ assert(old_size < size);
ceph_assert(isp2(bytes_per_block));
- uint64_t blocks0 = size / bytes_per_block;
- if (blocks0 / blocks_per_key * blocks_per_key != blocks0) {
- blocks0 = (blocks0 / blocks_per_key + 1) * blocks_per_key;
- dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size
- << " to 0x" << (blocks0 * bytes_per_block)
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ // blocks0 is the key-aligned block count that covered the old size
+ auto blocks0 = size_2_block_count(old_size);
+ if (blocks0 * bytes_per_block > old_size) {
+ dout(10) << __func__ << " rounding1 blocks up from 0x" << std::hex
+ << old_size << " to 0x" << (blocks0 * bytes_per_block)
<< " (0x" << blocks0 << " blocks)" << std::dec << dendl;
- // reset previous past-eof blocks to unallocated
- _xor(size, blocks0 * bytes_per_block - size, txn);
+ // reset past-eof blocks to unallocated
+ _xor(old_size, blocks0 * bytes_per_block - old_size, txn);
}
- size = p2align(new_size, bytes_per_block);
- blocks = size / bytes_per_block;
+ size = p2align(size, bytes_per_block);
+ blocks = size_2_block_count(size);
- if (blocks / blocks_per_key * blocks_per_key != blocks) {
- blocks = (blocks / blocks_per_key + 1) * blocks_per_key;
- dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size
- << " to 0x" << (blocks * bytes_per_block)
+ if (blocks * bytes_per_block > size) {
+ dout(10) << __func__ << " rounding2 blocks up from 0x" << std::hex
+ << size << " to 0x" << (blocks * bytes_per_block)
<< " (0x" << blocks << " blocks)" << std::dec << dendl;
// set past-eof blocks as allocated
_xor(size, blocks * bytes_per_block - size, txn);
encode(size, bl);
txn->set(meta_prefix, "size", bl);
}
+ // NOTE(review): the result of submit_transaction_sync() is not checked, so
+ // a failed commit still ends in "return 0" - confirm this is acceptable.
+ db->submit_transaction_sync(txn);
+
return 0;
}
-int BitmapFreelistManager::init(KeyValueDB *kvdb)
+// Reads the value stored under key "size" in the meta_prefix namespace of
+// kvdb and decodes it into *res.
+// Returns 0 on success. When the lookup fails, an error is logged, *res is
+// left untouched and ENOENT is returned as a POSITIVE value.
+// NOTE(review): _sync() checks this result with ceph_assert(r >= 0), which a
+// positive ENOENT always satisfies, so a missing key is silently treated as
+// "keep the caller's initial *res". Confirm whether that is relied upon
+// before switching this to -ENOENT.
+int BitmapFreelistManager::read_size_meta_from_db(KeyValueDB* kvdb,
+ uint64_t* res)
{
- dout(1) << __func__ << dendl;
+ bufferlist v;
+ int r = kvdb->get(meta_prefix, "size", &v);
+ if (r < 0) {
+ derr << __func__ << " missing size meta in DB" << dendl;
+ return ENOENT;
+ } else {
+ auto p = v.cbegin();
+ decode(*res, p);
+ r = 0;
+ }
+ return r;
+}
+// Loads the freelist parameters from the key/value store: iterates over the
+// entries under meta_prefix and decodes them into the bytes_per_block,
+// blocks, size and blocks_per_key members, logging each value.
+// An entry with an unrecognized key is only logged; with the return type now
+// void, it no longer aborts the load with -EIO.
+void BitmapFreelistManager::_load_from_db(KeyValueDB* kvdb)
+{
KeyValueDB::Iterator it = kvdb->get_iterator(meta_prefix);
it->lower_bound(string());
auto p = bl.cbegin();
decode(bytes_per_block, p);
dout(10) << __func__ << " bytes_per_block 0x" << std::hex
- << bytes_per_block << std::dec << dendl;
+ << bytes_per_block << std::dec << dendl;
} else if (k == "blocks") {
bufferlist bl = it->value();
auto p = bl.cbegin();
decode(blocks, p);
dout(10) << __func__ << " blocks 0x" << std::hex << blocks << std::dec
- << dendl;
+ << dendl;
} else if (k == "size") {
bufferlist bl = it->value();
auto p = bl.cbegin();
decode(size, p);
dout(10) << __func__ << " size 0x" << std::hex << size << std::dec
- << dendl;
+ << dendl;
} else if (k == "blocks_per_key") {
bufferlist bl = it->value();
auto p = bl.cbegin();
decode(blocks_per_key, p);
dout(10) << __func__ << " blocks_per_key 0x" << std::hex << blocks_per_key
- << std::dec << dendl;
+ << std::dec << dendl;
} else {
derr << __func__ << " unrecognized meta " << k << dendl;
- return -EIO;
}
it->next();
}
+}
+
+
+// Initializes the freelist manager.
+// Parameters are taken from the bdev label first; if _init_from_label()
+// returns anything other than 0 they are loaded from kvdb instead via
+// _load_from_db(). Afterwards _sync() is run against kvdb, with
+// db_in_read_only passed as its read_only flag.
+int BitmapFreelistManager::init(const bluestore_bdev_label_t& label,
+ KeyValueDB *kvdb,
+ bool db_in_read_only)
+{
+ dout(1) << __func__ << dendl;
+ int r = _init_from_label(label);
+ if (r != 0) {
+ dout(1) << __func__ << " fall back to legacy meta repo" << dendl;
+ _load_from_db(kvdb);
+ }
+ _sync(kvdb, db_in_read_only);
dout(10) << __func__ << std::hex
<< " size 0x" << size
return 0;
}
+// Fills size, blocks, bytes_per_block and blocks_per_key from the "bfm_*"
+// entries of the bdev label metadata, in that order.
+// Returns 0 when all four entries are present and parse without error.
+// Otherwise returns ENOENT (a positive value) at the first entry that is
+// missing or fails to parse; members assigned before that point keep the
+// values written to them.
+int BitmapFreelistManager::_init_from_label(const bluestore_bdev_label_t& label)
+{
+  dout(1) << __func__ << dendl;
+
+  const int not_usable = ENOENT;
+  const auto& meta = label.meta;
+  string err;
+
+  // --- bfm_size ---
+  auto it = meta.find("bfm_size");
+  if (it == meta.end()) {
+    // a label without this entry is normal for legacy deployed OSDs
+    dout(0) << __func__ << " bfm_size not found in bdev meta" << dendl;
+    return not_usable;
+  }
+  size = strict_iecstrtoll(it->second.c_str(), &err);
+  if (!err.empty()) {
+    derr << __func__ << " Failed to parse - "
+         << it->first << ":" << it->second
+         << ", error: " << err << dendl;
+    return not_usable;
+  }
+
+  // --- bfm_blocks ---
+  it = meta.find("bfm_blocks");
+  if (it == meta.end()) {
+    derr << __func__ << " bfm_blocks not found in bdev meta" << dendl;
+    return not_usable;
+  }
+  blocks = strict_iecstrtoll(it->second.c_str(), &err);
+  if (!err.empty()) {
+    derr << __func__ << " Failed to parse - "
+         << it->first << ":" << it->second
+         << ", error: " << err << dendl;
+    return not_usable;
+  }
+
+  // --- bfm_bytes_per_block ---
+  it = meta.find("bfm_bytes_per_block");
+  if (it == meta.end()) {
+    derr << __func__ << " bfm_bytes_per_block not found in bdev meta" << dendl;
+    return not_usable;
+  }
+  bytes_per_block = strict_iecstrtoll(it->second.c_str(), &err);
+  if (!err.empty()) {
+    derr << __func__ << " Failed to parse - "
+         << it->first << ":" << it->second
+         << ", error: " << err << dendl;
+    return not_usable;
+  }
+
+  // --- bfm_blocks_per_key ---
+  it = meta.find("bfm_blocks_per_key");
+  if (it == meta.end()) {
+    derr << __func__ << " bfm_blocks_per_key not found in bdev meta" << dendl;
+    return not_usable;
+  }
+  blocks_per_key = strict_iecstrtoll(it->second.c_str(), &err);
+  if (!err.empty()) {
+    derr << __func__ << " Failed to parse - "
+         << it->first << ":" << it->second
+         << ", error: " << err << dendl;
+    return not_usable;
+  }
+
+  return 0;
+}
+
void BitmapFreelistManager::_init_misc()
{
bufferptr z(blocks_per_key >> 3);
<< dendl;
}
+// Runs the size sync check against kvdb, always with _sync()'s read_only
+// flag set.
+void BitmapFreelistManager::sync(KeyValueDB* kvdb)
+{
+  const bool read_only = true;
+  _sync(kvdb, read_only);
+}
+
+// Compares the size recorded in kvdb with the in-memory size and reconciles:
+//  - DB size smaller and read_only is false: _expand() brings the persisted
+//    state up to the in-memory size (its result is asserted to be 0);
+//  - DB size larger: the parameters are reloaded from kvdb;
+//  - anything else: nothing is done.
+void BitmapFreelistManager::_sync(KeyValueDB* kvdb, bool read_only)
+{
+  dout(10) << __func__ << " checks if size sync is needed" << dendl;
+
+  uint64_t db_size = 0;
+  const int read_rc = read_size_meta_from_db(kvdb, &db_size);
+  ceph_assert(read_rc >= 0);
+
+  if (!read_only && db_size < size) {
+    dout(1) << __func__ << " committing new size 0x" << std::hex << size
+            << std::dec << dendl;
+    const int expand_rc = _expand(db_size, kvdb);
+    ceph_assert(expand_rc == 0);
+    return;
+  }
+
+  if (db_size > size) {
+    // This might happen when the OSD went through the sequence
+    //   upgrade -> downgrade -> expand -> upgrade
+    // One needs to run expand once again to sync up.
+    dout(1) << __func__ << " fall back to legacy meta repo" << dendl;
+    _load_from_db(kvdb);
+  }
+}
+
void BitmapFreelistManager::shutdown()
{
dout(1) << __func__ << dendl;
}
}
}
+
+// Converts a byte size into a block count: target_size / bytes_per_block
+// (integer division), raised to the next multiple of blocks_per_key unless it
+// already is one.
+uint64_t BitmapFreelistManager::size_2_block_count(uint64_t target_size) const
+{
+  auto block_count = target_size / bytes_per_block;
+  const auto tail = block_count % blocks_per_key;
+  if (tail != 0) {
+    // pad the partially used last key up to a full key
+    block_count += blocks_per_key - tail;
+  }
+  return block_count;
+}
+
+// Appends the freelist parameters to *res as (key, value) string pairs, in
+// the order bfm_blocks, bfm_size, bfm_bytes_per_block, bfm_blocks_per_key.
+// With target_size == 0 the current blocks/size members are reported.
+// Otherwise target_size is aligned with p2align() to bytes_per_block and the
+// block count is derived from the aligned value via size_2_block_count().
+void BitmapFreelistManager::get_meta(
+  uint64_t target_size,
+  std::vector<std::pair<string, string>>* res) const
+{
+  if (target_size != 0) {
+    const uint64_t aligned_size = p2align(target_size, bytes_per_block);
+    const auto aligned_blocks = size_2_block_count(aligned_size);
+
+    res->emplace_back("bfm_blocks", stringify(aligned_blocks));
+    res->emplace_back("bfm_size", stringify(aligned_size));
+  } else {
+    res->emplace_back("bfm_blocks", stringify(blocks));
+    res->emplace_back("bfm_size", stringify(size));
+  }
+
+  // these two do not depend on the requested size
+  res->emplace_back("bfm_bytes_per_block", stringify(bytes_per_block));
+  res->emplace_back("bfm_blocks_per_key", stringify(blocks_per_key));
+}
if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
<< " and fsid " << fsid << " check bypassed" << dendl;
- }
- else if (label.osd_uuid != fsid) {
+ } else if (label.osd_uuid != fsid) {
derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
<< " does not match our fsid " << fsid << dendl;
return -EIO;
bdev = NULL;
}
-int BlueStore::_open_fm(KeyValueDB::Transaction t)
+int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
{
+ int r;
+ bluestore_bdev_label_t label;
+
ceph_assert(fm == NULL);
fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
ceph_assert(fm);
start += l + u;
}
}
+ r = _write_out_fm_meta(0, false, &label);
+ ceph_assert(r == 0);
+ } else {
+ string p = path + "/block";
+ r = _read_bdev_label(cct, p, &label);
+ if (r < 0) {
+ derr << __func__ << " freelist init failed, error reading bdev label: " << cpp_strerror(r) << dendl;
+ delete fm;
+ fm = NULL;
+ return r;
+ }
}
-
- int r = fm->init(db);
+ r = fm->init(label, db, read_only);
if (r < 0) {
derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
delete fm;
fm = NULL;
}
+// Merges the freelist manager's metadata into the label of "<path>/block"
+// and writes the label back.
+//   target_size      - passed to fm->get_meta(); also stored as label.size
+//                      when update_root_size is set
+//   update_root_size - whether label.size is overwritten with target_size
+//   res_label        - if non-null, receives a copy of the label that was
+//                      handed to _write_bdev_label(), whatever that returned
+// Returns the negative result of _read_bdev_label() if reading fails,
+// otherwise the result of _write_bdev_label().
+int BlueStore::_write_out_fm_meta(uint64_t target_size,
+  bool update_root_size,
+  bluestore_bdev_label_t* res_label)
+{
+  string block_path = path + "/block";
+
+  std::vector<std::pair<string, string>> entries;
+  fm->get_meta(target_size, &entries);
+
+  bluestore_bdev_label_t label;
+  int r = _read_bdev_label(cct, block_path, &label);
+  if (r < 0) {
+    return r;
+  }
+
+  // overlay the freelist entries on top of whatever the label already holds
+  for (auto& entry : entries) {
+    label.meta[entry.first] = entry.second;
+  }
+  if (update_root_size) {
+    label.size = target_size;
+  }
+
+  r = _write_bdev_label(cct, block_path, label);
+  if (res_label != nullptr) {
+    *res_label = label;
+  }
+  return r;
+}
+
int BlueStore::_open_alloc()
{
ceph_assert(alloc == NULL);
goto out_db;
}
- r = _open_fm(nullptr);
+ r = _open_fm(nullptr, true);
if (r < 0)
goto out_db;
_close_fm();
return r;
}
+ fm->sync(db);
}
} else {
r = _open_db(false, false);
goto out_db;
}
- r = _open_fm(nullptr);
+ r = _open_fm(nullptr, false);
if (r < 0)
goto out_db;
{
KeyValueDB::Transaction t = db->get_transaction();
- r = _open_fm(t);
+ r = _open_fm(t, true);
if (r < 0)
goto out_close_db;
{
}
uint64_t size0 = fm->get_size();
uint64_t size = bdev->get_size();
- cold_close();
if (size0 < size) {
- out << "Expanding Main..." << std::endl;
- int r = _mount(false);
- ceph_assert(r == 0);
-
out << bluefs_layout.shared_bdev
- <<" : expanding " << " from 0x" << std::hex
- << size0 << " to 0x" << size << std::dec << std::endl;
- KeyValueDB::Transaction txn;
- txn = db->get_transaction();
- r = fm->expand(size, txn);
+ << " : expanding " << " from 0x" << std::hex
+ << size0 << " to 0x" << size << std::dec << std::endl;
+ _write_out_fm_meta(size, true);
+ cold_close();
+
+ // mount in read/write to sync expansion changes
+ r = _mount(false);
ceph_assert(r == 0);
- db->submit_transaction_sync(txn);
-
- // always reference to slow device here
- string p = get_device_path(BlueFS::BDEV_SLOW);
- ceph_assert(!p.empty());
- const char* path = p.c_str();
- bluestore_bdev_label_t label;
- r = _read_bdev_label(cct, path, &label);
- if (r < 0) {
- derr << "unable to read label for " << path << ": "
- << cpp_strerror(r) << dendl;
- } else {
- label.size = size;
- r = _write_bdev_label(cct, path, label);
- if (r < 0) {
- derr << "unable to write label for " << path << ": "
- << cpp_strerror(r) << dendl;
- } else {
- out << bluefs_layout.shared_bdev
- <<" : size label updated to " << size
- << std::endl;
- }
- }
umount();
+ } else {
+ cold_close();
}
return r;
}
int r = db->submit_transaction_sync(t);
ceph_assert(r == 0);
}
+ if (ondisk_format == 3) {
+ // changes:
+ // - FreelistManager keeps meta within bdev label
+ int r = _write_out_fm_meta(0);
+ ceph_assert(r == 0);
+
+ ondisk_format = 4;
+ }
}
// done
dout(1) << __func__ << " done" << dendl;