ceph-bluestore-tool.
Signed-off-by: Igor Fedotov <ifedotov@suse.com>
// write supers
super.log_fnode = log_file->fnode;
- _write_super();
+ _write_super(BDEV_DB);
flush_bdev();
// clean up
_shutdown_logger();
}
+/**
+ * Rewrite the BlueFS log and superblock so that a freshly attached device,
+ * registered under BDEV_NEWDB or BDEV_NEWWAL, becomes part of the layout.
+ *
+ * @param id BDEV_NEWDB or BDEV_NEWWAL; any other value trips an assertion.
+ * @return always 0 (_rewrite_log_sync() returns void).
+ */
+int BlueFS::prepare_new_device(int id)
+{
+  dout(1) << __func__ << dendl;
+
+  if (id == BDEV_NEWDB) {
+    // With a WAL device present the log stays there; otherwise it is
+    // allocated on the new DB device and its extents are renamed to BDEV_DB.
+    int new_log_dev_cur = BDEV_WAL;
+    int new_log_dev_next = BDEV_WAL;
+    if (!bdev[BDEV_WAL]) {
+      new_log_dev_cur = BDEV_NEWDB;
+      new_log_dev_next = BDEV_DB;
+    }
+    _rewrite_log_sync(false,
+                      BDEV_NEWDB,
+                      new_log_dev_cur,
+                      new_log_dev_next,
+                      RENAME_DB2SLOW);
+  } else if (id == BDEV_NEWWAL) {
+    _rewrite_log_sync(false, BDEV_DB, BDEV_NEWWAL, BDEV_WAL, REMOVE_WAL);
+  } else {
+    // ceph_assert, like the rest of this file, instead of plain assert
+    ceph_assert(false);
+  }
+  return 0;
+}
+
void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
{
if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
return 0;
}
-int BlueFS::_write_super()
+int BlueFS::_write_super(int dev)
{
// build superblock
bufferlist bl;
ceph_assert(bl.length() <= get_super_length());
bl.append_zero(get_super_length() - bl.length());
- bdev[BDEV_DB]->write(get_super_offset(), bl, false);
+ bdev[dev]->write(get_super_offset(), bl, false);
dout(20) << __func__ << " v " << super.version
<< " crc 0x" << std::hex << crc
<< " offset 0x" << get_super_offset() << std::dec
return 0;
}
+/**
+ * Move all BlueFS file data that lives on the devices in devs_source onto an
+ * already attached device, then rewrite the log and superblock accordingly.
+ *
+ * The log file (ino 1) is not copied here; _rewrite_log_sync() allocates new
+ * log extents at the end.
+ *
+ * @param cct         context, used to read bluefs_buffered_io
+ * @param devs_source ids of the devices to be drained
+ * @param dev_target  id of the existing device that receives the data
+ * @return 0 on success, -EIO if a source extent cannot be read, -ENOSPC if
+ *         space cannot be allocated on the target device
+ */
+int BlueFS::device_migrate_to_existing(
+  CephContext *cct,
+  const set<int>& devs_source,
+  int dev_target)
+{
+  vector<byte> buf;
+  bool buffered = cct->_conf->bluefs_buffered_io;
+
+  ceph_assert(dev_target < (int)MAX_BDEV);
+
+  int flags = 0;
+  flags |= devs_source.count(BDEV_DB) ?
+    (REMOVE_DB | RENAME_SLOW2DB) : 0;
+  flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
+  int dev_target_new = dev_target;
+
+  // Slow device without separate DB one is addressed via BDEV_DB
+  // Hence need renaming.
+  if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
+    dev_target_new = BDEV_DB;
+    dout(0) << __func__ << " super to be written to " << dev_target << dendl;
+  }
+
+  for (auto& p : file_map) {
+    //do not copy log
+    if (p.second->fnode.ino == 1) {
+      continue;
+    }
+    auto& fnode_extents = p.second->fnode.extents;
+
+    for (auto ext_it = fnode_extents.begin();
+         ext_it != fnode_extents.end();
+         ++ext_it) {
+      if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
+        bluefs_extent_t old_ext = *ext_it;
+        PExtentVector extents;
+        auto l =
+          _allocate_without_fallback(dev_target, old_ext.length, &extents);
+        if (l != 0) {
+          derr << __func__ << " unable to allocate len 0x" << std::hex
+               << old_ext.length << std::dec << " from " << (int)dev_target
+               << dendl;
+          return -ENOSPC;
+        }
+
+        buf.resize(old_ext.length);
+        int r = bdev[old_ext.bdev]->read_random(
+          old_ext.offset,
+          old_ext.length,
+          (char*)&buf.at(0),
+          buffered);
+        if (r != 0) {
+          // report the device that was actually read, not the target
+          derr << __func__ << " failed to read 0x" << std::hex
+               << old_ext.offset << "~" << old_ext.length << std::dec
+               << " from " << (int)old_ext.bdev << dendl;
+          return -EIO;
+        }
+
+        ceph_assert(!extents.empty());
+        // NOTE(review): assumes the allocated extents add up to exactly
+        // old_ext.length (the allocator rounds up to bluefs_alloc_size) -
+        // confirm source extents are always aligned that way.
+        uint64_t src_buf_pos = 0;
+        for (size_t i = 0; i < extents.size(); ++i) {
+          if (i == 0) {
+            // overwrite existing extent
+            *ext_it = bluefs_extent_t(dev_target_new,
+                                      extents[i].offset, extents[i].length);
+          } else {
+            // then insert more extents if needed; emplace() returns a valid
+            // iterator to the inserted element
+            ++ext_it;
+            ext_it = fnode_extents.emplace(ext_it, dev_target_new,
+                                           extents[i].offset,
+                                           extents[i].length);
+          }
+          bufferlist bl;
+          bl.append((char*)&buf.at(src_buf_pos), extents[i].length);
+          int wr = bdev[dev_target]->write(extents[i].offset, bl, buffered);
+          ceph_assert(wr == 0);
+          src_buf_pos += extents[i].length;
+        }
+
+        // hand the old space back to the source device's allocator
+        PExtentVector to_release;
+        to_release.emplace_back(old_ext.offset, old_ext.length);
+        alloc[old_ext.bdev]->release(to_release);
+      } else if (dev_target != dev_target_new && ext_it->bdev == dev_target) {
+        // data already on the target only needs the new device id
+        ext_it->bdev = dev_target_new;
+      }
+    }
+    auto& prefer_bdev = p.second->fnode.prefer_bdev;
+    if (prefer_bdev != dev_target && devs_source.count(prefer_bdev)) {
+      prefer_bdev = dev_target_new;
+    }
+  }
+  // new logging device in the current naming scheme
+  int new_log_dev_cur = bdev[BDEV_WAL] ?
+    BDEV_WAL :
+    bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
+
+  // new logging device in new naming scheme
+  int new_log_dev_next = new_log_dev_cur;
+
+  if (devs_source.count(new_log_dev_cur)) {
+    // SLOW device is addressed via BDEV_DB too hence either WAL or DB
+    new_log_dev_next = ((flags & REMOVE_WAL) || !bdev[BDEV_WAL]) ?
+      BDEV_DB :
+      BDEV_WAL;
+
+    dout(0) << __func__ << " log moved from " << new_log_dev_cur
+            << " to " << new_log_dev_next << dendl;
+
+    new_log_dev_cur =
+      ((flags & REMOVE_DB) && new_log_dev_next == BDEV_DB) ?
+        BDEV_SLOW :
+        new_log_dev_next;
+  }
+
+  _rewrite_log_sync(
+    false,
+    (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
+    new_log_dev_cur,
+    new_log_dev_next,
+    flags);
+  return 0;
+}
+
+/**
+ * Move all BlueFS file data that lives on the devices in devs_source onto a
+ * newly attached device (BDEV_NEWDB or BDEV_NEWWAL), then rewrite the log
+ * and superblock accordingly.
+ *
+ * The log file (ino 1) is not copied here; _rewrite_log_sync() allocates new
+ * log extents at the end. File extents are stored with the temporary
+ * dev_target id.
+ *
+ * @param cct         context, used to read bluefs_buffered_io
+ * @param devs_source ids of the devices to be drained
+ * @param dev_target  BDEV_NEWDB or BDEV_NEWWAL
+ * @return 0 on success, -EIO if a source extent cannot be read, -ENOSPC if
+ *         space cannot be allocated on the target device
+ */
+int BlueFS::device_migrate_to_new(
+  CephContext *cct,
+  const set<int>& devs_source,
+  int dev_target)
+{
+  vector<byte> buf;
+  bool buffered = cct->_conf->bluefs_buffered_io;
+
+  // both sides must compare against dev_target; a bare "|| (int)BDEV_NEWWAL"
+  // is always true
+  ceph_assert(dev_target == (int)BDEV_NEWDB ||
+              dev_target == (int)BDEV_NEWWAL);
+
+  int flags = 0;
+
+  flags |= devs_source.count(BDEV_DB) ?
+    (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW : REMOVE_DB) :
+    0;
+  flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
+
+  for (auto& p : file_map) {
+    //do not copy log
+    if (p.second->fnode.ino == 1) {
+      continue;
+    }
+    auto& fnode_extents = p.second->fnode.extents;
+
+    for (auto ext_it = fnode_extents.begin();
+         ext_it != fnode_extents.end();
+         ++ext_it) {
+      if (ext_it->bdev == dev_target || !devs_source.count(ext_it->bdev)) {
+        continue;
+      }
+      bluefs_extent_t old_ext = *ext_it;
+      PExtentVector extents;
+      auto l =
+        _allocate_without_fallback(dev_target, old_ext.length, &extents);
+      if (l != 0) {
+        derr << __func__ << " unable to allocate len 0x" << std::hex
+             << old_ext.length << std::dec << " from " << (int)dev_target
+             << dendl;
+        return -ENOSPC;
+      }
+
+      buf.resize(old_ext.length);
+      int r = bdev[old_ext.bdev]->read_random(
+        old_ext.offset,
+        old_ext.length,
+        (char*)&buf.at(0),
+        buffered);
+      dout(10)<<__func__<<" read = "<<r<<dendl;
+      if (r != 0) {
+        // report the device that was actually read, not the target
+        derr << __func__ << " failed to read 0x" << std::hex
+             << old_ext.offset << "~" << old_ext.length << std::dec
+             << " from " << (int)old_ext.bdev << dendl;
+        return -EIO;
+      }
+
+      ceph_assert(!extents.empty());
+      // NOTE(review): assumes the allocated extents add up to exactly
+      // old_ext.length (the allocator rounds up to bluefs_alloc_size) -
+      // confirm source extents are always aligned that way.
+      uint64_t src_buf_pos = 0;
+      for (size_t i = 0; i < extents.size(); ++i) {
+        if (i == 0) {
+          // overwrite existing extent
+          *ext_it = bluefs_extent_t(dev_target,
+                                    extents[i].offset, extents[i].length);
+        } else {
+          // then insert more extents if needed; emplace() returns a valid
+          // iterator to the inserted element
+          ++ext_it;
+          ext_it = fnode_extents.emplace(ext_it, dev_target,
+                                         extents[i].offset,
+                                         extents[i].length);
+        }
+        bufferlist bl;
+        bl.append((char*)&buf.at(src_buf_pos), extents[i].length);
+        int wr = bdev[dev_target]->write(extents[i].offset, bl, buffered);
+        ceph_assert(wr == 0);
+        src_buf_pos += extents[i].length;
+      }
+
+      // hand the old space back to the source device's allocator
+      PExtentVector to_release;
+      to_release.emplace_back(old_ext.offset, old_ext.length);
+      alloc[old_ext.bdev]->release(to_release);
+    }
+    auto& prefer_bdev = p.second->fnode.prefer_bdev;
+    if (prefer_bdev != dev_target && devs_source.count(prefer_bdev)) {
+      prefer_bdev = dev_target;
+    }
+  }
+  // new logging device in the current naming scheme
+  int new_log_dev_cur =
+    bdev[BDEV_NEWWAL] ?
+      BDEV_NEWWAL :
+      (bdev[BDEV_WAL] && !(flags & REMOVE_WAL)) ?
+        BDEV_WAL :
+        bdev[BDEV_NEWDB] ?
+          BDEV_NEWDB :
+          (bdev[BDEV_DB] && !(flags & REMOVE_DB)) ?
+            BDEV_DB :
+            BDEV_SLOW;
+
+  // new logging device in new naming scheme
+  int new_log_dev_next =
+    new_log_dev_cur == BDEV_NEWWAL ?
+      BDEV_WAL :
+      new_log_dev_cur == BDEV_NEWDB ?
+        BDEV_DB :
+        new_log_dev_cur;
+
+  // device that receives the superblock
+  int super_dev =
+    dev_target == BDEV_NEWDB ?
+      BDEV_NEWDB :
+      bdev[BDEV_DB] ?
+        BDEV_DB :
+        BDEV_SLOW;
+
+  _rewrite_log_sync(
+    false,
+    super_dev,
+    new_log_dev_cur,
+    new_log_dev_next,
+    flags);
+  return 0;
+}
+
BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
{
auto p = file_map.find(ino);
return true;
}
-void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t)
+void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t,
+ int flags)
{
t->seq = 1;
t->uuid = super.uuid;
for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
interval_set<uint64_t>& p = block_all[bdev];
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
- dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
+ auto bdev_new = bdev;
+ if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) {
+ continue;
+ }
+ if ((flags & REMOVE_DB) && bdev == BDEV_DB) {
+ continue;
+ }
+ if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
+ bdev_new = BDEV_DB;
+ }
+ if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
+ bdev_new = BDEV_SLOW;
+ }
+ if (bdev == BDEV_NEWDB) {
+ // REMOVE_DB xor RENAME_DB
+ ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
+ ceph_assert(!(flags & RENAME_SLOW2DB));
+ bdev_new = BDEV_DB;
+ }
+ if (bdev == BDEV_NEWWAL) {
+ ceph_assert(flags & REMOVE_WAL);
+ bdev_new = BDEV_WAL;
+ }
+ dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x"
<< std::hex << q.get_start() << "~" << q.get_len() << std::dec
<< dendl;
- t->op_alloc_add(bdev, q.get_start(), q.get_len());
+ t->op_alloc_add(bdev_new, q.get_start(), q.get_len());
}
}
for (auto& p : file_map) {
if (p.first == 1)
continue;
- dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl;
ceph_assert(p.first > 1);
+
+ for(auto& e : p.second->fnode.extents) {
+ auto bdev = e.bdev;
+ auto bdev_new = bdev;
+ ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
+ if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
+ bdev_new = BDEV_DB;
+ }
+ if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
+ bdev_new = BDEV_SLOW;
+ }
+ if (bdev == BDEV_NEWDB) {
+ // REMOVE_DB xor RENAME_DB
+ ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
+ ceph_assert(!(flags & RENAME_SLOW2DB));
+ bdev_new = BDEV_DB;
+ }
+ if (bdev == BDEV_NEWWAL) {
+ ceph_assert(flags & REMOVE_WAL);
+ bdev_new = BDEV_WAL;
+ }
+ e.bdev = bdev_new;
+ }
+ dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl;
t->op_file_update(p.second->fnode);
}
for (auto& p : dir_map) {
+// Synchronous log compaction: delegates to _rewrite_log_sync() and then
+// bumps the l_bluefs_log_compactions counter.
void BlueFS::_compact_log_sync()
{
dout(10) << __func__ << dendl;
+ // allocate_with_fallback = true, superblock written to BDEV_DB, the log
+ // stays on its file's prefer_bdev (same id before and after, so no extent
+ // renaming), and flags = 0 (no device removal or renaming).
+ _rewrite_log_sync(true,
+ BDEV_DB,
+ log_writer->file->fnode.prefer_bdev,
+ log_writer->file->fnode.prefer_bdev,
+ 0);
+ logger->inc(l_bluefs_log_compactions);
+}
+
+void BlueFS::_rewrite_log_sync(bool allocate_with_fallback,
+ int super_dev,
+ int log_dev,
+ int log_dev_new,
+ int flags)
+{
File *log_file = log_writer->file.get();
// clear out log (be careful who calls us!!!)
log_t.clear();
+ dout(20) << __func__ << " super_dev:" << super_dev
+ << " log_dev:" << log_dev
+ << " log_dev_new:" << log_dev_new
+ << " flags:" << flags
+ << dendl;
bluefs_transaction_t t;
- _compact_log_dump_metadata(&t);
+ _compact_log_dump_metadata(&t, flags);
dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
t.op_jump_seq(log_seq);
mempool::bluefs::vector<bluefs_extent_t> old_extents;
uint64_t old_allocated = 0;
+ int r;
log_file->fnode.swap_extents(old_extents, old_allocated);
- int r = _allocate(log_file->fnode.prefer_bdev, need, &log_file->fnode);
- ceph_assert(r == 0);
+ if (allocate_with_fallback) {
+ r = _allocate(log_dev, need, &log_file->fnode);
+ ceph_assert(r == 0);
+ } else {
+ PExtentVector extents;
+ r = _allocate_without_fallback(log_dev,
+ need,
+ &extents);
+ ceph_assert(r == 0);
+ for (auto& p : extents) {
+ log_file->fnode.append_extent(
+ bluefs_extent_t(log_dev, p.offset, p.length));
+ }
+ }
_close_writer(log_writer);
#endif
flush_bdev();
- dout(10) << __func__ << " writing super" << dendl;
super.log_fnode = log_file->fnode;
+ // rename device if needed
+ if (log_dev != log_dev_new) {
+ dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
+ for (auto& p : super.log_fnode.extents) {
+ p.bdev = log_dev_new;
+ }
+ }
+ dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
+
++super.version;
- _write_super();
+ _write_super(super_dev);
flush_bdev();
dout(10) << __func__ << " release old log extents " << old_extents << dendl;
for (auto& r : old_extents) {
pending_release[r.bdev].insert(r.offset, r.length);
}
-
- logger->inc(l_bluefs_log_compactions);
}
/*
bluefs_transaction_t t;
//avoid record two times in log_t and _compact_log_dump_metadata.
log_t.clear();
- _compact_log_dump_metadata(&t);
+ _compact_log_dump_metadata(&t, 0);
// conservative estimate for final encoded size
new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
dout(10) << __func__ << " writing super" << dendl;
super.log_fnode = log_file->fnode;
++super.version;
- _write_super();
+ _write_super(BDEV_DB);
lock.unlock();
flush_bdev();
logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
for (unsigned i = 0; i < MAX_BDEV; ++i) {
if (bdev[i]) {
- ceph_assert(h->iocv[i]);
- if (h->iocv[i]->has_pending_aios()) {
+ if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
bdev[i]->aio_submit(h->iocv[i]);
}
}
}
}
+/**
+ * Allocate space on exactly one device; no other device is tried.
+ *
+ * @param id      device whose allocator is used; must be < alloc.size()
+ * @param len     requested length, rounded up to bluefs_alloc_size
+ * @param extents receives the allocated extents
+ * @return 0 on success, -ENOENT if the device has no allocator, -ENOSPC if
+ *         the full (rounded) length could not be allocated; a partial
+ *         allocation is released before returning
+ */
+int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
+                                       PExtentVector* extents)
+{
+  dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
+           << " from " << (int)id << dendl;
+  ceph_assert(id < alloc.size());
+  if (!alloc[id]) {
+    return -ENOENT;
+  }
+
+  uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size;
+  uint64_t left = round_up_to(len, min_alloc_size);
+
+  extents->reserve(4);  // 4 should be (more than) enough for most allocations
+  int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents);
+  if (alloc_len < (int64_t)left) {
+    if (alloc_len != 0) {
+      alloc[id]->release(*extents);
+    }
+    if (bdev[id]) {
+      derr << __func__ << " failed to allocate 0x" << std::hex << left
+           << " on bdev " << (int)id
+           << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
+    } else {
+      derr << __func__ << " failed to allocate 0x" << std::hex << left
+           << " on bdev " << (int)id << ", dne" << std::dec << dendl;
+    }
+    // alloc[id] is known to be non-null here (checked above)
+    alloc[id]->dump();
+    return -ENOSPC;
+  }
+
+  return 0;
+}
+
int BlueFS::_allocate(uint8_t id, uint64_t len,
bluefs_fnode_t* node)
{
dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
for (unsigned i=0; i<MAX_BDEV; ++i) {
if (bdev[i]) {
- ceph_assert(h->iocv[i]);
- h->iocv[i]->aio_wait();
- bdev[i]->queue_reap_ioc(h->iocv[i]);
+ if (h->iocv[i]) {
+ h->iocv[i]->aio_wait();
+ bdev[i]->queue_reap_ioc(h->iocv[i]);
+ }
}
}
delete h;
class BlueFS {
public:
CephContext* cct;
- static constexpr unsigned MAX_BDEV = 3;
+ static constexpr unsigned MAX_BDEV = 5;
static constexpr unsigned BDEV_WAL = 0;
static constexpr unsigned BDEV_DB = 1;
static constexpr unsigned BDEV_SLOW = 2;
+ static constexpr unsigned BDEV_NEWWAL = 3;
+ static constexpr unsigned BDEV_NEWDB = 4;
enum {
WRITER_UNKNOWN,
int _allocate(uint8_t bdev, uint64_t len,
bluefs_fnode_t* node);
+ int _allocate_without_fallback(uint8_t id, uint64_t len,
+ PExtentVector* extents);
+
int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
int _flush(FileWriter *h, bool force);
int _fsync(FileWriter *h, std::unique_lock<std::mutex>& l);
uint64_t jump_to = 0);
uint64_t _estimate_log_size();
bool _should_compact_log();
- void _compact_log_dump_metadata(bluefs_transaction_t *t);
+
+ enum {
+ REMOVE_DB = 1,
+ REMOVE_WAL = 2,
+ RENAME_SLOW2DB = 4,
+ RENAME_DB2SLOW = 8,
+ };
+ void _compact_log_dump_metadata(bluefs_transaction_t *t,
+ int flags);
void _compact_log_sync();
void _compact_log_async(std::unique_lock<std::mutex>& l);
+ void _rewrite_log_sync(bool allocate_with_fallback,
+ int super_dev,
+ int log_dev,
+ int new_log_dev,
+ int flags);
+
//void _aio_finish(void *priv);
void _flush_bdev_safely(FileWriter *h);
void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length);
int _open_super();
- int _write_super();
+ int _write_super(int dev);
int _replay(bool noop, bool to_stdout = false); ///< replay journal
FileWriter *_create_writer(FileRef f);
int mkfs(uuid_d osd_uuid);
int mount();
void umount();
+ int prepare_new_device(int id);
int log_dump();
void get_devices(set<string> *ls);
int fsck();
+ int device_migrate_to_new(
+ CephContext *cct,
+ const set<int>& devs_source,
+ int dev_target);
+ int device_migrate_to_existing(
+ CephContext *cct,
+ const set<int>& devs_source,
+ int dev_target);
+
uint64_t get_used();
uint64_t get_total(unsigned id);
uint64_t get_free(unsigned id);
return ret;
}
+// Create the BlueFS instance and attach the devices found under `path`
+// (block.db, block, block.wal), then mount it.
+//
+// create: when true, device labels are set rather than checked, the initial
+//         extents are handed to BlueFS and mkfs() is run before mounting.
+// Returns 0 on success. On failure returns a negative error code after
+// deleting the BlueFS instance and resetting `bluefs` to NULL.
+int BlueStore::_open_bluefs(bool create)
+{
+ int r;
+ bluefs = new BlueFS(cct);
+
+ string bfn;
+ struct stat st;
+
+ // optional dedicated DB device
+ bfn = path + "/block.db";
+ if (::stat(bfn.c_str(), &st) == 0) {
+ r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn,
+ create && cct->_conf->bdev_enable_discard);
+ if (r < 0) {
+ derr << __func__ << " add block device(" << bfn << ") returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
+ r = _check_or_set_bdev_label(
+ bfn,
+ bluefs->get_block_device_size(BlueFS::BDEV_DB),
+ "bluefs db", create);
+ if (r < 0) {
+ derr << __func__
+ << " check block device(" << bfn << ") label returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+ if (create) {
+ // give BlueFS the whole device except the first SUPER_RESERVED bytes
+ bluefs->add_block_extent(
+ BlueFS::BDEV_DB,
+ SUPER_RESERVED,
+ bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
+ }
+ // with a dedicated DB device the shared device takes the BDEV_SLOW slot
+ bluefs_shared_bdev = BlueFS::BDEV_SLOW;
+ bluefs_single_shared_device = false;
+ } else {
+ // remember the stat() error before lstat() can overwrite errno
+ r = -errno;
+ if (::lstat(bfn.c_str(), &st) == -1) {
+ // no block.db entry at all: not an error, the shared device is
+ // addressed as BDEV_DB
+ r = 0;
+ bluefs_shared_bdev = BlueFS::BDEV_DB;
+ } else {
+ derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+
+ // shared device
+ bfn = path + "/block";
+ // never trim here
+ r = bluefs->add_block_device(bluefs_shared_bdev, bfn, false);
+ if (r < 0) {
+ derr << __func__ << " add block device(" << bfn << ") returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ if (create) {
+ // note: we always leave the first SUPER_RESERVED (8k) of the device unused
+ uint64_t initial =
+ bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
+ cct->_conf->bluestore_bluefs_gift_ratio);
+ initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
+ if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
+ derr << __func__ << " bluefs_alloc_size 0x" << std::hex
+ << cct->_conf->bluefs_alloc_size << " is not a multiple of "
+ << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+ r = -EINVAL;
+ goto free_bluefs;
+ }
+ // align to bluefs's alloc_size
+ initial = p2roundup(initial, cct->_conf->bluefs_alloc_size);
+ // put bluefs in the middle of the device in case it is an HDD
+ uint64_t start = p2align((bdev->get_size() - initial) / 2,
+ cct->_conf->bluefs_alloc_size);
+ //avoiding superblock overwrite
+ ceph_assert(cct->_conf->bluefs_alloc_size > _get_ondisk_reserved());
+ start = std::max(cct->_conf->bluefs_alloc_size, start);
+
+ // record the range both in BlueFS and in BlueStore's bluefs_extents
+ bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
+ bluefs_extents.insert(start, initial);
+ }
+
+ // optional dedicated WAL device
+ bfn = path + "/block.wal";
+ if (::stat(bfn.c_str(), &st) == 0) {
+ r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
+ create && cct->_conf->bdev_enable_discard);
+ if (r < 0) {
+ derr << __func__ << " add block device(" << bfn << ") returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
+ r = _check_or_set_bdev_label(
+ bfn,
+ bluefs->get_block_device_size(BlueFS::BDEV_WAL),
+ "bluefs wal", create);
+ if (r < 0) {
+ derr << __func__ << " check block device(" << bfn
+ << ") label returned: " << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+
+ if (create) {
+ // give BlueFS the whole device except the first BDEV_LABEL_BLOCK_SIZE bytes
+ bluefs->add_block_extent(
+ BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
+ bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
+ BDEV_LABEL_BLOCK_SIZE);
+ }
+ kv_options["separate_wal_dir"] = "1";
+ bluefs_single_shared_device = false;
+ } else {
+ // remember the stat() error before lstat() can overwrite errno
+ r = -errno;
+ if (::lstat(bfn.c_str(), &st) == -1) {
+ // no block.wal entry at all: not an error
+ kv_options.erase("separate_wal_dir");
+ r = 0;
+ } else {
+ derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+
+ if (create) {
+ // NOTE(review): the return value of mkfs() is not checked here
+ bluefs->mkfs(fsid);
+ }
+ r = bluefs->mount();
+ if (r < 0) {
+ derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+
+ return 0;
+ // common error exit: destroy the BlueFS instance and report r
+free_bluefs:
+ ceph_assert(bluefs);
+ delete bluefs;
+ bluefs = NULL;
+ return r;
+}
+
+// Unmount and destroy the BlueFS instance, then reset the member pointer.
+// The caller must ensure `bluefs` is set.
+void BlueStore::_close_bluefs()
+{
+  bluefs->umount();
+  delete bluefs;
+  bluefs = nullptr;
+}
int BlueStore::_open_db(bool create, bool to_repair_db)
{
int r;
derr << " backend must be rocksdb to use bluefs" << dendl;
return -EINVAL;
}
- bluefs = new BlueFS(cct);
-
- string bfn;
- struct stat st;
-
- bfn = path + "/block.db";
- if (::stat(bfn.c_str(), &st) == 0) {
- r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn,
- create && cct->_conf->bdev_enable_discard);
- if (r < 0) {
- derr << __func__ << " add block device(" << bfn << ") returned: "
- << cpp_strerror(r) << dendl;
- goto free_bluefs;
- }
-
- if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
- r = _check_or_set_bdev_label(
- bfn,
- bluefs->get_block_device_size(BlueFS::BDEV_DB),
- "bluefs db", create);
- if (r < 0) {
- derr << __func__
- << " check block device(" << bfn << ") label returned: "
- << cpp_strerror(r) << dendl;
- goto free_bluefs;
- }
- }
- if (create) {
- bluefs->add_block_extent(
- BlueFS::BDEV_DB,
- SUPER_RESERVED,
- bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
- }
- bluefs_shared_bdev = BlueFS::BDEV_SLOW;
- bluefs_single_shared_device = false;
- } else {
- r = -errno;
- if (::lstat(bfn.c_str(), &st) == -1) {
- r = 0;
- bluefs_shared_bdev = BlueFS::BDEV_DB;
- } else {
- derr << __func__ << " " << bfn << " symlink exists but target unusable: "
- << cpp_strerror(r) << dendl;
- goto free_bluefs;
- }
- }
-
- // shared device
- bfn = path + "/block";
- // never trim here
- r = bluefs->add_block_device(bluefs_shared_bdev, bfn, false);
- if (r < 0) {
- derr << __func__ << " add block device(" << bfn << ") returned: "
- << cpp_strerror(r) << dendl;
- goto free_bluefs;
- }
- if (create) {
- // note: we always leave the first SUPER_RESERVED (8k) of the device unused
- uint64_t initial =
- bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio +
- cct->_conf->bluestore_bluefs_gift_ratio);
- initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
- if (cct->_conf->bluefs_alloc_size % min_alloc_size) {
- derr << __func__ << " bluefs_alloc_size 0x" << std::hex
- << cct->_conf->bluefs_alloc_size << " is not a multiple of "
- << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
- r = -EINVAL;
- goto free_bluefs;
- }
- // align to bluefs's alloc_size
- initial = p2roundup(initial, cct->_conf->bluefs_alloc_size);
- // put bluefs in the middle of the device in case it is an HDD
- uint64_t start = p2align((bdev->get_size() - initial) / 2,
- cct->_conf->bluefs_alloc_size);
- //avoiding superblock overwrite
- ceph_assert(cct->_conf->bluefs_alloc_size > _get_ondisk_reserved());
- start = std::max(cct->_conf->bluefs_alloc_size, start);
-
- bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
- bluefs_extents.insert(start, initial);
- }
-
- bfn = path + "/block.wal";
- if (::stat(bfn.c_str(), &st) == 0) {
- r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
- create && cct->_conf->bdev_enable_discard);
- if (r < 0) {
- derr << __func__ << " add block device(" << bfn << ") returned: "
- << cpp_strerror(r) << dendl;
- goto free_bluefs;
- }
- if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
- r = _check_or_set_bdev_label(
- bfn,
- bluefs->get_block_device_size(BlueFS::BDEV_WAL),
- "bluefs wal", create);
- if (r < 0) {
- derr << __func__ << " check block device(" << bfn
- << ") label returned: " << cpp_strerror(r) << dendl;
- goto free_bluefs;
- }
- }
+ r = _open_bluefs(create);
- if (create) {
- bluefs->add_block_extent(
- BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
- bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
- BDEV_LABEL_BLOCK_SIZE);
- }
- kv_options["separate_wal_dir"] = "1";
- bluefs_single_shared_device = false;
- } else {
- r = -errno;
- if (::lstat(bfn.c_str(), &st) == -1) {
- kv_options.erase("separate_wal_dir");
- r = 0;
- } else {
- derr << __func__ << " " << bfn << " symlink exists but target unusable: "
- << cpp_strerror(r) << dendl;
- goto free_bluefs;
- }
- }
-
- if (create) {
- bluefs->mkfs(fsid);
- }
- r = bluefs->mount();
- if (r < 0) {
- derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
- goto free_bluefs;
- }
if (cct->_conf->bluestore_bluefs_env_mirror) {
rocksdb::Env *a = new BlueRocksEnv(bluefs);
rocksdb::Env *b = rocksdb::Env::Default();
if (!db) {
derr << __func__ << " error creating db" << dendl;
if (bluefs) {
- bluefs->umount();
- delete bluefs;
- bluefs = NULL;
+ _close_bluefs();
}
// delete env manually here since we can't depend on db to do this
// under this case
dout(1) << __func__ << " opened " << kv_backend
<< " path " << fn << " options " << options << dendl;
return 0;
-
-free_bluefs:
- ceph_assert(bluefs);
- delete bluefs;
- bluefs = NULL;
- return r;
}
void BlueStore::_close_db()
delete db;
db = NULL;
if (bluefs) {
- bluefs->umount();
- delete bluefs;
- bluefs = NULL;
+ _close_bluefs();
}
}
<< bluefs_extents << std::dec << dendl;
synct->set(PREFIX_SUPER, "bluefs_extents", bl);
- int r = db->submit_transaction_sync(synct);
- assert(r == 0);
+ synct->set(PREFIX_SUPER, "bluefs_extents_back", bl);
+ int r = db->submit_transaction_sync(synct);
+ ceph_assert(r == 0);
}
return 0;
}
if (alloc_len <= 0) {
dout(0) << __func__ << " no allocate on 0x" << std::hex << gift
- << " min_alloc_size 0x" << cct->_conf->bluefs_alloc_size
+ << " bluefs_alloc_size 0x" << cct->_conf->bluefs_alloc_size
<< std::dec << dendl;
_dump_alloc_on_rebalance_failure();
return 0;
} else if (alloc_len < (int64_t)gift) {
dout(0) << __func__ << " insufficient allocate on 0x" << std::hex << gift
- << " min_alloc_size 0x" << cct->_conf->bluefs_alloc_size
+ << " bluefs_alloc_size 0x" << cct->_conf->bluefs_alloc_size
<< " allocated 0x" << alloc_len
<< std::dec << dendl;
_dump_alloc_on_rebalance_failure();
return r;
}
+// Limited mount for BlueFS-only maintenance: open the store path, open,
+// read and lock the fsid, then open BlueFS without creating it.
+// Every step is asserted, so the function only ever returns 0.
+int BlueStore::_mount_for_bluefs()
+{
+  int r;
+
+  r = _open_path();
+  ceph_assert(r == 0);
+
+  r = _open_fsid(false);
+  ceph_assert(r == 0);
+
+  r = _read_fsid(&fsid);
+  ceph_assert(r == 0);
+
+  r = _lock_fsid();
+  ceph_assert(r == 0);
+
+  r = _open_bluefs(false);
+  ceph_assert(r == 0);
+
+  return r;
+}
+
+// Counterpart of _mount_for_bluefs(): closes BlueFS, then the fsid, then the
+// path - the reverse of the order in which _mount_for_bluefs() opens them.
+void BlueStore::_umount_for_bluefs()
+{
+ _close_bluefs();
+ _close_fsid();
+ _close_path();
+}
+
+/**
+ * Attach a new WAL or DB device to an existing store.
+ *
+ * Creates the block.wal / block.db link for dev_path, registers the device
+ * with BlueFS under the temporary id, labels it if labels are supported,
+ * hands BlueFS the space after the reserved area and lets
+ * BlueFS::prepare_new_device() rewrite the log and superblock.
+ *
+ * The store must not be mounted (path_fd < 0). Every step after the
+ * configuration check is asserted.
+ *
+ * @param id       BlueFS::BDEV_NEWWAL or BlueFS::BDEV_NEWDB
+ * @param dev_path path of the device (or file) to attach
+ * @return 0 on success, -EIO if bluestore_bluefs is disabled
+ */
+int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
+{
+  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
+  int r;
+  ceph_assert(path_fd < 0);
+
+  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
+
+  if (!cct->_conf->bluestore_bluefs) {
+    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+    return -EIO;
+  }
+
+  r = _mount_for_bluefs();
+  ceph_assert(r == 0);
+
+  // The WAL and DB cases differ only in these per-device settings.
+  const bool is_wal = (id == BlueFS::BDEV_NEWWAL);
+  const string link_name = is_wal ? "block.wal" : "block.db";
+  const string label_desc = is_wal ? "bluefs wal" : "bluefs db";
+  const uint64_t link_size = is_wal ?
+    cct->_conf->bluestore_block_wal_size :
+    cct->_conf->bluestore_block_db_size;
+  const int reserved = is_wal ? BDEV_LABEL_BLOCK_SIZE : SUPER_RESERVED;
+  const string p = path + "/" + link_name;
+
+  r = _setup_block_symlink_or_file(link_name, dev_path, link_size, true);
+  ceph_assert(r == 0);
+
+  r = bluefs->add_block_device(id, p, cct->_conf->bdev_enable_discard);
+  ceph_assert(r == 0);
+
+  if (bluefs->bdev_support_label(id)) {
+    r = _check_or_set_bdev_label(
+      p,
+      bluefs->get_block_device_size(id),
+      label_desc,
+      true);
+    ceph_assert(r == 0);
+  }
+
+  // remount BlueFS now that the new device is attached
+  bluefs->umount();
+  r = bluefs->mount();
+  ceph_assert(r == 0);
+
+  bluefs->add_block_extent(
+    id,
+    reserved,
+    bluefs->get_block_device_size(id) - reserved);
+
+  r = bluefs->prepare_new_device(id);
+  ceph_assert(r == 0);
+  dout(0) << __func__ << " success" << dendl;
+
+  _umount_for_bluefs();
+  return r;
+}
+
+/**
+ * Migrate BlueFS data from the devices in devs_source to an already attached
+ * device and remove the block.db / block.wal links of the drained devices.
+ *
+ * The target must have room for the used space of all sources plus
+ * bluestore_bluefs_min_free. For a slow-device target the store is fully
+ * mounted temporarily to give BlueFS more space if needed.
+ *
+ * The store must not be mounted (path_fd < 0).
+ *
+ * @param devs_source BlueFS ids of the devices to drain
+ * @param id          BlueFS::BDEV_SLOW or BlueFS::BDEV_DB
+ * @return 0 on success, -EIO if bluestore_bluefs is disabled, -ENOSPC if the
+ *         target lacks space, or the error from the BlueFS migration
+ */
+int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
+                                                 int id)
+{
+  dout(10) << __func__ << " id:" << id << dendl;
+  ceph_assert(path_fd < 0);
+
+  ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
+
+  if (!cct->_conf->bluestore_bluefs) {
+    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+    return -EIO;
+  }
+
+  int r = _mount_for_bluefs();
+  ceph_assert(r == 0);
+
+  // require bluestore_bluefs_min_free to be free at target device!
+  uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
+  for (auto src_id : devs_source) {
+    used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
+  }
+  uint64_t target_free = bluefs->get_free(id);
+  if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
+    // will need to remount full BlueStore instance to allocate more space
+    _umount_for_bluefs();
+
+    r = mount();
+    ceph_assert(r == 0);
+    dout(1) << __func__
+            << " Allocating more space at slow device for BlueFS: +"
+            << used_space - target_free << " bytes" << dendl;
+    r = allocate_bluefs_freespace(used_space - target_free);
+    umount();
+    if (r != 0) {
+      // the full umount() above already released everything
+      derr << __func__
+           << " can't migrate, unable to allocate extra space: "
+           << used_space - target_free << " at target:" << id
+           << dendl;
+      return -ENOSPC;
+    }
+
+    r = _mount_for_bluefs();
+    ceph_assert(r == 0);
+  } else if (target_free < used_space) {
+    derr << __func__
+         << " can't migrate, free space at target: " << target_free
+         << " is less than required space: " << used_space
+         << dendl;
+    // BlueFS is still mounted here; leave through the cleanup path so the
+    // BlueFS instance, fsid and path are not leaked
+    r = -ENOSPC;
+    goto shutdown;
+  }
+  r = bluefs->device_migrate_to_existing(cct, devs_source, id);
+  if (r < 0) {
+    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
+    goto shutdown;
+  }
+
+  // the drained devices are no longer part of the store
+  if (devs_source.count(BlueFS::BDEV_DB)) {
+    r = unlink(string(path + "/block.db").c_str());
+    ceph_assert(r == 0);
+  }
+  if (devs_source.count(BlueFS::BDEV_WAL)) {
+    r = unlink(string(path + "/block.wal").c_str());
+    ceph_assert(r == 0);
+  }
+
+shutdown:
+  _umount_for_bluefs();
+  return r;
+}
+
+/**
+ * Migrate BlueFS data from the devices in devs_source to a newly attached
+ * WAL or DB device.
+ *
+ * The new device is registered with BlueFS under the temporary id, the data
+ * is moved by BlueFS::device_migrate_to_new(), the links of the drained
+ * devices are removed and finally the block.wal / block.db link for dev_path
+ * is created.
+ *
+ * The store must not be mounted (path_fd < 0).
+ *
+ * @param devs_source BlueFS ids of the devices to drain
+ * @param id          BlueFS::BDEV_NEWWAL or BlueFS::BDEV_NEWDB
+ * @param dev_path    path of the new device (or file)
+ * @return 0 on success, -EIO if bluestore_bluefs is disabled, or the error
+ *         from the BlueFS migration
+ */
+int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
+                                            int id,
+                                            const string& dev_path)
+{
+  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
+  int r;
+  ceph_assert(path_fd < 0);
+
+  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
+
+  if (!cct->_conf->bluestore_bluefs) {
+    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+    return -EIO;
+  }
+
+  r = _mount_for_bluefs();
+  ceph_assert(r == 0);
+
+  // links to remove once the data has been moved; block.db is only a
+  // separate link when the shared device is not addressed as BDEV_DB
+  string link_db;
+  string link_wal;
+  if (devs_source.count(BlueFS::BDEV_DB) &&
+      bluefs_shared_bdev != BlueFS::BDEV_DB) {
+    link_db = path + "/block.db";
+  }
+  if (devs_source.count(BlueFS::BDEV_WAL)) {
+    link_wal = path + "/block.wal";
+  }
+
+  // The WAL and DB cases differ only in these per-device settings.
+  const bool is_wal = (id == BlueFS::BDEV_NEWWAL);
+  const string target_name = is_wal ? "block.wal" : "block.db";
+  const string label_desc = is_wal ? "bluefs wal" : "bluefs db";
+  const uint64_t target_size = is_wal ?
+    cct->_conf->bluestore_block_wal_size :
+    cct->_conf->bluestore_block_db_size;
+  const int reserved = is_wal ? BDEV_LABEL_BLOCK_SIZE : SUPER_RESERVED;
+
+  // the link does not exist yet, so the device is opened via dev_path
+  r = bluefs->add_block_device(id, dev_path,
+                               cct->_conf->bdev_enable_discard);
+  ceph_assert(r == 0);
+
+  if (bluefs->bdev_support_label(id)) {
+    r = _check_or_set_bdev_label(
+      dev_path,
+      bluefs->get_block_device_size(id),
+      label_desc,
+      true);
+    ceph_assert(r == 0);
+  }
+
+  // remount BlueFS now that the new device is attached
+  bluefs->umount();
+  r = bluefs->mount();
+  ceph_assert(r == 0);
+
+  bluefs->add_block_extent(
+    id, reserved, bluefs->get_block_device_size(id) - reserved);
+
+  r = bluefs->device_migrate_to_new(cct, devs_source, id);
+
+  if (r < 0) {
+    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
+    goto shutdown;
+  }
+
+  if (!link_db.empty()) {
+    r = unlink(link_db.c_str());
+    ceph_assert(r == 0);
+  }
+  if (!link_wal.empty()) {
+    r = unlink(link_wal.c_str());
+    ceph_assert(r == 0);
+  }
+  r = _setup_block_symlink_or_file(
+    target_name,
+    dev_path,
+    target_size,
+    true);
+  ceph_assert(r == 0);
+  dout(0) << __func__ << " success" << dendl;
+
+shutdown:
+  _umount_for_bluefs();
+  return r;
+}
+
void BlueStore::set_cache_shards(unsigned num)
{
dout(10) << __func__ << " " << num << dendl;
// bluefs alloc
if (cct->_conf->bluestore_bluefs) {
+ {
+ bluefs_extents.clear();
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "bluefs_extents_back", &bl);
+ auto p = bl.cbegin();
+ try {
+ decode(bluefs_extents, p);
+ }
+ catch (buffer::error& e) {
+ dout(0) << __func__ << " unable to read bluefs_extents_back" << dendl;
+ //return -EIO;
+ }
+ dout(10) << __func__ << " bluefs_extents_back 0x" << std::hex << bluefs_extents
+ << std::dec << dendl;
+ }
bluefs_extents.clear();
bufferlist bl;
db->get(PREFIX_SUPER, "bluefs_extents", &bl);
}
dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
<< std::dec << dendl;
+
}
// ondisk format
// its initialization (and outside of _open_bdev)
void _validate_bdev();
void _close_bdev();
+
+ int _open_bluefs(bool create);
+ void _close_bluefs();
+
+ // Limited (u)mount intended for BlueFS operations only
+ int _mount_for_bluefs();
+ void _umount_for_bluefs();
+
+
/*
* @warning to_repair_db means that we open this db to repair it, will not
* hold the rocksdb's file lock.
f->close_section();
}
+ // Attach the device or file at `path` to this store as a new BlueFS volume.
+ // The bluestore tool passes BlueFS::BDEV_NEWDB or BlueFS::BDEV_NEWWAL as
+ // `id`, and an empty `path` when no target was given. Callers treat 0 as
+ // success and format any other value with cpp_strerror().
+ int add_new_bluefs_device(int id, const string& path);
+ // Migrate BlueFS data from the volumes listed in `devs_source` to volume
+ // `id`, which the store already uses. Callers treat 0 as success.
+ int migrate_to_existing_bluefs_device(const set<int>& devs_source,
+ int id);
+ // Migrate BlueFS data from the volumes listed in `devs_source` to the new
+ // device at `path`, which is added as `id` (BlueFS::BDEV_NEWDB or
+ // BlueFS::BDEV_NEWWAL). Returns 0 on success and a negative error code when
+ // the BlueFS migration itself fails.
+ int migrate_to_new_bluefs_device(const set<int>& devs_source,
+ int id,
+ const string& path);
+
public:
int statfs(struct store_statfs_t *buf) override;
return nullptr;
}
-void add_devices(
- BlueFS *fs,
+// Classify the devices in `devs` by their on-disk bdev label and record the
+// BlueFS slot each one belongs to.
+//
+// Every path in `devs` must carry a readable label; otherwise an error is
+// printed and the process exits. Devices labelled "bluefs db" / "bluefs wal"
+// are stored in `*got` as path -> BlueFS::BDEV_DB / BlueFS::BDEV_WAL. A
+// device labelled "main" is stored after the loop: as BlueFS::BDEV_SLOW when
+// a "bluefs db" device was seen, as BlueFS::BDEV_DB otherwise. Devices with
+// any other label are not recorded.
+//
+// `has_db` / `has_wal` are optional (may be null); when given they report
+// whether a "bluefs db" / "bluefs wal" device was found. `got` is
+// dereferenced unconditionally and must not be null.
+void parse_devices(
CephContext *cct,
- const vector<string>& devs)
+ const vector<string>& devs,
+ map<string, int>* got,
+ bool* has_db,
+ bool* has_wal)
{
string main;
- set<int> got;
- for (auto& i : devs) {
+ // Tracked separately from *has_db because has_db may be null.
+ bool was_db = false;
+ if (has_wal) {
+ *has_wal = false;
+ }
+ if (has_db) {
+ *has_db = false;
+ }
+ for (auto& d : devs) {
bluestore_bdev_label_t label;
- int r = BlueStore::_read_bdev_label(cct, i, &label);
+ int r = BlueStore::_read_bdev_label(cct, d, &label);
if (r < 0) {
- cerr << "unable to read label for " << i << ": "
+ cerr << "unable to read label for " << d << ": "
<< cpp_strerror(r) << std::endl;
exit(EXIT_FAILURE);
}
int id = -1;
if (label.description == "main")
- main = i;
- else if (label.description == "bluefs db")
+ main = d;
+ else if (label.description == "bluefs db") {
id = BlueFS::BDEV_DB;
- else if (label.description == "bluefs wal")
+ was_db = true;
+ if (has_db) {
+ *has_db = true;
+ }
+ }
+ else if (label.description == "bluefs wal") {
id = BlueFS::BDEV_WAL;
- if (id >= 0) {
- got.insert(id);
- cout << " slot " << id << " " << i << std::endl;
- int r = fs->add_block_device(id, i, false);
- if (r < 0) {
- cerr << "unable to open " << i << ": " << cpp_strerror(r) << std::endl;
- exit(EXIT_FAILURE);
+ if (has_wal) {
+ *has_wal = true;
}
}
+ // id stays -1 for "main" (handled after the loop) and for unknown labels.
+ if (id >= 0) {
+ got->emplace(d, id);
+ }
}
if (main.length()) {
- int id = BlueFS::BDEV_DB;
- if (got.count(BlueFS::BDEV_DB))
- id = BlueFS::BDEV_SLOW;
- cout << " slot " << id << " " << main << std::endl;
- int r = fs->add_block_device(id, main, false);
+ // The main device takes the DB slot unless a dedicated DB device exists.
+ int id = was_db ? BlueFS::BDEV_SLOW : BlueFS::BDEV_DB;
+ got->emplace(main, id);
+ }
+}
+
+// Open every BlueFS device found in `devs` on `fs`.
+//
+// The path -> slot mapping comes from parse_devices(); each device is
+// announced on stdout and then added with add_block_device(). A device that
+// cannot be opened is reported on stderr and terminates the process.
+void add_devices(
+ BlueFS *fs,
+ CephContext *cct,
+ const vector<string>& devs)
+{
+ map<string, int> got;
+ parse_devices(cct, devs, &got, nullptr, nullptr);
+ // Bind by const reference: iterating by value copied the path string of
+ // every entry.
+ for (const auto& e : got) {
+ cout << " slot " << e.second << " " << e.first << std::endl;
+ int r = fs->add_block_device(e.second, e.first, false);
if (r < 0) {
- cerr << "unable to open " << main << ": " << cpp_strerror(r)
- << std::endl;
+ cerr << "unable to open " << e.first << ": " << cpp_strerror(r) << std::endl;
exit(EXIT_FAILURE);
}
}
{
string out_dir;
vector<string> devs;
+ vector<string> devs_source;
+ string dev_target;
string path;
string action;
string log_file;
("log-file,l", po::value<string>(&log_file), "log file")
("log-level", po::value<int>(&log_level), "log level (30=most, 20=lots, 10=some, 1=little)")
("dev", po::value<vector<string>>(&devs), "device(s)")
+ ("devs-source", po::value<vector<string>>(&devs_source), "bluefs-dev-migrate source device(s)")
+ ("dev-target", po::value<string>(&dev_target), "target/resulting device")
("deep", po::value<bool>(&fsck_deep), "deep fsck (read all data)")
("key,k", po::value<string>(&key), "label metadata key name")
("value,v", po::value<string>(&value), "label metadata value")
;
po::options_description po_positional("Positional options");
po_positional.add_options()
- ("command", po::value<string>(&action), "fsck, repair, bluefs-export, bluefs-bdev-sizes, bluefs-bdev-expand, show-label, set-label-key, rm-label-key, prime-osd-dir, bluefs-log-dump")
+ ("command", po::value<string>(&action), "fsck, repair, bluefs-export, bluefs-bdev-sizes, bluefs-bdev-expand, bluefs-bdev-new-db, bluefs-bdev-new-wal, bluefs-bdev-migrate, show-label, set-label-key, rm-label-key, prime-osd-dir, bluefs-log-dump")
;
po::options_description po_all("All options");
po_all.add(po_options).add(po_positional);
}
inferring_bluefs_devices(devs, path);
}
+  // bluefs-bdev-new-db / bluefs-bdev-new-wal: the store path is mandatory,
+  // the target is optional (a notice is printed when it is left out).
+  if (action == "bluefs-bdev-new-wal" || action == "bluefs-bdev-new-db") {
+    if (path.empty()) {
+      cerr << "must specify bluestore path" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (dev_target.empty()) {
+      cout << "NOTICE: --dev-target option omitted, will allocate as a file" << std::endl;
+    }
+    inferring_bluefs_devices(devs, path);
+  }
+  // bluefs-bdev-migrate: needs the store path, at least one source device
+  // and a target device. Devices are inferred right after the path check,
+  // before the remaining arguments are validated.
+  if (action == "bluefs-bdev-migrate") {
+    if (path.empty()) {
+      cerr << "must specify bluestore path" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    inferring_bluefs_devices(devs, path);
+
+    const char* complaint = nullptr;
+    if (devs_source.empty()) {
+      complaint = "must specify source devices with --devs-source";
+    } else if (dev_target.empty()) {
+      complaint = "must specify target device with --dev-target";
+    }
+    if (complaint != nullptr) {
+      cerr << complaint << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
vector<const char*> args;
if (log_file.size()) {
delete fs;
} else if (action == "bluefs-log-dump") {
log_dump(cct.get(), path, devs);
+  } else if (action == "bluefs-bdev-new-db" || action == "bluefs-bdev-new-wal") {
+    // Attach a brand-new standalone DB or WAL volume to the store at `path`.
+    // Refused when the requested volume, or both volumes, already exist.
+    map<string, int> cur_devs_map;
+    bool need_db = action == "bluefs-bdev-new-db";
+
+    bool has_wal = false;
+    bool has_db = false;
+    parse_devices(cct.get(), devs, &cur_devs_map, &has_db, &has_wal);
+
+    if (has_db && has_wal) {
+      cerr << "can't allocate new device, both WAL and DB exist"
+           << std::endl;
+      exit(EXIT_FAILURE);
+    } else if (need_db && has_db) {
+      cerr << "can't allocate new DB device, already exists"
+           << std::endl;
+      exit(EXIT_FAILURE);
+    } else if (!need_db && has_wal) {
+      cerr << "can't allocate new WAL device, already exists"
+           << std::endl;
+      exit(EXIT_FAILURE);
+    }
+
+    // Resolve the target before the store is opened. When realpath() fails
+    // the buffer contents are undefined, so carrying on would hand an
+    // arbitrary (or empty, i.e. "allocate as a file") path to BlueStore.
+    char target_path[PATH_MAX] = "";
+    if (!dev_target.empty() &&
+        realpath(dev_target.c_str(), target_path) == nullptr) {
+      const int err = errno;  // capture before streaming can disturb errno
+      cerr << "failed to retrieve absolute path for " << dev_target
+           << ": " << cpp_strerror(err)
+           << std::endl;
+      exit(EXIT_FAILURE);
+    }
+
+    // Create either DB or WAL volume
+    BlueStore bluestore(cct.get(), path);
+    int r = bluestore.add_new_bluefs_device(
+      need_db ? BlueFS::BDEV_NEWDB : BlueFS::BDEV_NEWWAL,
+      target_path);
+    if (r != 0) {
+      cerr << "failed to add " << (need_db ? "DB" : "WAL") << " device:"
+           << cpp_strerror(r)
+           << std::endl;
+      // Report the failure through the exit status as well.
+      return 1;
+    }
+    cout << (need_db ? "DB" : "WAL") << " device added " << target_path
+         << std::endl;
+  } else if (action == "bluefs-bdev-migrate") {
+    // Move BlueFS data off the --devs-source volumes onto --dev-target.
+    // The target is either a volume the store already uses or a new device
+    // that is added as a DB or WAL volume.
+    map<string, int> cur_devs_map;  // path -> slot for every current volume
+    set<int> src_dev_ids;           // slots to migrate from
+    map<string, int> src_devs;      // same sources keyed by path, for reporting
+
+    parse_devices(cct.get(), devs, &cur_devs_map, nullptr, nullptr);
+    for (auto& s : devs_source) {
+      auto i = cur_devs_map.find(s);
+      if (i != cur_devs_map.end()) {
+        src_devs.emplace(*i);
+        src_dev_ids.emplace(i->second);
+      } else {
+        cerr << "can't migrate " << s << ", not a valid bluefs volume "
+             << std::endl;
+        exit(EXIT_FAILURE);
+      }
+    }
+
+    auto i = cur_devs_map.find(dev_target);
+
+    if (i != cur_devs_map.end()) {
+      // Migrate to an existing BlueFS volume
+
+      auto dev_target_id = i->second;
+      if (dev_target_id == BlueFS::BDEV_WAL) {
+        // currently we're unable to migrate to WAL device since there is no space
+        // reserved for superblock
+        cerr << "Migrate to WAL device isn't supported." << std::endl;
+        exit(EXIT_FAILURE);
+      }
+
+      BlueStore bluestore(cct.get(), path);
+      int r = bluestore.migrate_to_existing_bluefs_device(
+        src_dev_ids,
+        dev_target_id);
+      if (r == 0) {
+        for (const auto& src : src_devs) {
+          if (src.second != BlueFS::BDEV_SLOW) {
+            cout << " device removed:" << src.second << " " << src.first
+                 << std::endl;
+          }
+        }
+      } else {
+        // Print the slot that was actually targeted, and keep the error
+        // text separated from the device path.
+        cerr << "failed to migrate to existing BlueFS device: "
+             << dev_target_id
+             << " " << dev_target
+             << ": " << cpp_strerror(r)
+             << std::endl;
+      }
+      ceph_assert(r == 0);
+    } else {
+      // Migrate to a new BlueFS volume
+      // via creating either DB or WAL volume
+      int dev_target_id;
+      if (src_dev_ids.count(BlueFS::BDEV_DB)) {
+        // if we have DB device in the source list - we create DB device
+        // (and may be remove WAL).
+        dev_target_id = BlueFS::BDEV_NEWDB;
+      } else if (src_dev_ids.count(BlueFS::BDEV_WAL)) {
+        dev_target_id = BlueFS::BDEV_NEWWAL;
+      } else {
+        // bluefs-bdev-new-db / bluefs-bdev-new-wal are positional commands,
+        // not "--" options.
+        cerr << "Unable to migrate Slow volume to new location, "
+                "please allocate new DB or WAL with "
+                "bluefs-bdev-new-db(wal) command"
+             << std::endl;
+        exit(EXIT_FAILURE);
+      }
+
+      // Resolve the target before the store is opened. When realpath()
+      // fails the buffer contents are undefined, so do not continue with it.
+      char target_path[PATH_MAX] = "";
+      if (!dev_target.empty() &&
+          realpath(dev_target.c_str(), target_path) == nullptr) {
+        const int err = errno;  // capture before streaming can disturb errno
+        cerr << "failed to retrieve absolute path for " << dev_target
+             << ": " << cpp_strerror(err)
+             << std::endl;
+        exit(EXIT_FAILURE);
+      }
+
+      BlueStore bluestore(cct.get(), path);
+
+      bool need_db = dev_target_id == BlueFS::BDEV_NEWDB;
+      int r = bluestore.migrate_to_new_bluefs_device(
+        src_dev_ids,
+        dev_target_id,
+        target_path);
+      if (r == 0) {
+        for (const auto& src : src_devs) {
+          if (src.second != BlueFS::BDEV_SLOW) {
+            cout << " device removed:" << src.second << " " << src.first
+                 << std::endl;
+          }
+        }
+        // Report the DB or the WAL slot depending on what was created.
+        cout << " device added: "
+             << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_WAL)
+             << " " << target_path
+             << std::endl;
+      } else {
+        cerr << "failed to migrate to new BlueFS device: "
+             << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_WAL)
+             << " " << target_path
+             << ": " << cpp_strerror(r)
+             << std::endl;
+      }
+
+      ceph_assert(r == 0);
+    }
} else {
cerr << "unrecognized action " << action << std::endl;
return 1;