From: Igor Fedotov Date: Tue, 17 Jul 2018 13:06:36 +0000 (+0300) Subject: os/bluestore: introduce offline DB/WAL volume migration for X-Git-Tag: 3.2-0~142^2~3 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=a45e633ad304715fe3fefd02c0f720c6074a62ae;p=ceph-ci.git os/bluestore: introduce offline DB/WAL volume migration for ceph-bluestore-tool. Signed-off-by: Igor Fedotov --- diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 058541a30ed..5d28a22ddc4 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -377,7 +377,7 @@ int BlueFS::mkfs(uuid_d osd_uuid) // write supers super.log_fnode = log_file->fnode; - _write_super(); + _write_super(BDEV_DB); flush_bdev(); // clean up @@ -491,6 +491,31 @@ void BlueFS::umount() _shutdown_logger(); } +int BlueFS::prepare_new_device(int id) +{ + dout(1) << __func__ << dendl; + + if(id == BDEV_NEWDB) { + int new_log_dev_cur = BDEV_WAL; + int new_log_dev_next = BDEV_WAL; + if (!bdev[BDEV_WAL]) { + new_log_dev_cur = BDEV_NEWDB; + new_log_dev_next = BDEV_DB; + } + _rewrite_log_sync(false, + BDEV_NEWDB, + new_log_dev_cur, + new_log_dev_next, + RENAME_DB2SLOW); + //} + } else if(id == BDEV_NEWWAL) { + _rewrite_log_sync(false, BDEV_DB, BDEV_NEWWAL, BDEV_WAL, REMOVE_WAL); + } else { + assert(false); + } + return 0; +} + void BlueFS::collect_metadata(map *pm, unsigned skip_bdev_id) { if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB]) @@ -516,7 +541,7 @@ int BlueFS::fsck() return 0; } -int BlueFS::_write_super() +int BlueFS::_write_super(int dev) { // build superblock bufferlist bl; @@ -529,7 +554,7 @@ int BlueFS::_write_super() ceph_assert(bl.length() <= get_super_length()); bl.append_zero(get_super_length() - bl.length()); - bdev[BDEV_DB]->write(get_super_offset(), bl, false); + bdev[dev]->write(get_super_offset(), bl, false); dout(20) << __func__ << " v " << super.version << " crc 0x" << std::hex << crc << " offset 0x" << get_super_offset() << std::dec @@ -972,6 +997,261 @@ int 
BlueFS::log_dump() return 0; } +int BlueFS::device_migrate_to_existing( + CephContext *cct, + const set<int>& devs_source, + int dev_target) +{ + vector<uint8_t> buf; + bool buffered = cct->_conf->bluefs_buffered_io; + + assert(dev_target < (int)MAX_BDEV); + + int flags = 0; + flags |= devs_source.count(BDEV_DB) ? + (REMOVE_DB | RENAME_SLOW2DB) : 0; + flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0; + int dev_target_new = dev_target; + + // Slow device without separate DB one is addressed via BDEV_DB + // Hence need renaming. + if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) { + dev_target_new = BDEV_DB; + dout(0) << __func__ << " super to be written to " << dev_target << dendl; + } + + for (auto& p : file_map) { + //do not copy log + if (p.second->fnode.ino == 1) { + continue; + } + auto& fnode_extents = p.second->fnode.extents; + + for (auto ext_it = fnode_extents.begin(); + ext_it != p.second->fnode.extents.end(); + ++ext_it) { + if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) { + bluefs_extent_t old_ext = *ext_it; + PExtentVector extents; + auto l = + _allocate_without_fallback(dev_target, old_ext.length, &extents); + if (l == 0) { + buf.resize(old_ext.length); + int r = bdev[old_ext.bdev]->read_random( + old_ext.offset, + old_ext.length, + (char*)&buf.at(0), + buffered); + if (r != 0) { + derr << __func__ << " failed to read 0x" << std::hex + << old_ext.offset << "~" <<old_ext.length << std::dec + << " from " << (int)dev_target << dendl; + return -EIO; + } + + assert(extents.size() > 0); + uint64_t src_buf_pos = 0; + { + // overwrite existing extent + *ext_it= + bluefs_extent_t(dev_target_new, extents[0].offset, extents[0].length); + bufferlist bl; + bl.append((char*)&buf.at(src_buf_pos), extents[0].length); + int r = bdev[dev_target]->write(extents[0].offset, bl, buffered); + ceph_assert(r == 0); + src_buf_pos += extents[0].length; + } + // then insert more extents if needed + for( size_t i = 1; i < extents.size(); ++i) { + bufferlist bl; + bl.append((char*)&buf.at(src_buf_pos), extents[i].length); + ++ext_it; + ext_it = fnode_extents.emplace(ext_it, 
dev_target_new, + extents[i].offset, extents[i].length); + int r = bdev[dev_target]->write(extents[i].offset, bl, buffered); + ceph_assert(r == 0); + src_buf_pos += extents[i].length; + } + { + PExtentVector to_release; + to_release.emplace_back(old_ext.offset, old_ext.length); + alloc[old_ext.bdev]->release(to_release); + } + + } else { + derr << __func__ << " unable to allocate len 0x" << std::hex + << old_ext.length << std::dec << " from " << (int)dev_target + << dendl; + return -ENOSPC; + } + } else if (dev_target != dev_target_new && ext_it->bdev == dev_target) { + ext_it->bdev = dev_target_new; + } + } + auto& prefer_bdev = p.second->fnode.prefer_bdev; + if (prefer_bdev != dev_target && devs_source.count(prefer_bdev)) { + prefer_bdev = dev_target_new; + } + } + // new logging device in the current naming scheme + int new_log_dev_cur = bdev[BDEV_WAL] ? + BDEV_WAL : + bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW; + + // new logging device in new naming scheme + int new_log_dev_next = new_log_dev_cur; + + if (devs_source.count(new_log_dev_cur)) { + // SLOW device is addressed via BDEV_DB too hence either WAL or DB + new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ? + BDEV_DB : + BDEV_WAL; + + dout(0) << __func__ << " log moved from " << new_log_dev_cur + << " to " << new_log_dev_next << dendl; + + new_log_dev_cur = + (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ? + BDEV_SLOW : + new_log_dev_next; + } + + _rewrite_log_sync( + false, + (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB, + new_log_dev_cur, + new_log_dev_next, + flags); + return 0; +} + +int BlueFS::device_migrate_to_new( + CephContext *cct, + const set& devs_source, + int dev_target) +{ + vector buf; + bool buffered = cct->_conf->bluefs_buffered_io; + + assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL); + + int flags = 0; + + flags |= devs_source.count(BDEV_DB) ? + (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) : + 0; + flags |= devs_source.count(BDEV_WAL) ? 
REMOVE_WAL : 0; + int dev_target_new = dev_target; + + for (auto& p : file_map) { + //do not copy log + if (p.second->fnode.ino == 1) { + continue; + } + auto& fnode_extents = p.second->fnode.extents; + + for (auto ext_it = fnode_extents.begin(); + ext_it != p.second->fnode.extents.end(); + ++ext_it) { + if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) { + bluefs_extent_t old_ext = *ext_it; + PExtentVector extents; + auto l = + _allocate_without_fallback(dev_target, old_ext.length, &extents); + if (l == 0) { + buf.resize(old_ext.length); + int r = bdev[old_ext.bdev]->read_random( + old_ext.offset, + old_ext.length, + (char*)&buf.at(0), + buffered); + dout(10)<<__func__<<" read = "<<r<<dendl; + assert(extents.size() > 0); + uint64_t src_buf_pos = 0; + { + // overwrite existing extent + *ext_it= + bluefs_extent_t(dev_target_new, extents[0].offset, extents[0].length); + bufferlist bl; + bl.append((char*)&buf.at(src_buf_pos), extents[0].length); + int r = bdev[dev_target]->write(extents[0].offset, bl, buffered); + ceph_assert(r == 0); + src_buf_pos += extents[0].length; + } + // then insert more extents if needed + for( size_t i = 1; i < extents.size(); ++i) { + bufferlist bl; + bl.append((char*)&buf.at(src_buf_pos), extents[i].length); + ++ext_it; + ext_it = fnode_extents.emplace(ext_it, dev_target_new, + extents[i].offset, extents[i].length); + int r = bdev[dev_target]->write(extents[i].offset, bl, buffered); + ceph_assert(r == 0); + src_buf_pos += extents[i].length; + } + { + PExtentVector to_release; + to_release.emplace_back(old_ext.offset, old_ext.length); + alloc[old_ext.bdev]->release(to_release); + } + } else { + derr << __func__ << " unable to allocate len 0x" << std::hex + << old_ext.length << std::dec << " from " << (int)dev_target + << dendl; + return -ENOSPC; + } + } else if (dev_target != dev_target_new && ext_it->bdev == dev_target) { + ext_it->bdev = dev_target_new; + } + } + auto& prefer_bdev = p.second->fnode.prefer_bdev; + if (prefer_bdev != dev_target && 
devs_source.count(prefer_bdev)) { + prefer_bdev = dev_target_new; + } + } + // new logging device in the current naming scheme + int new_log_dev_cur = + bdev[BDEV_NEWWAL] ? + BDEV_NEWWAL : + bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ? + BDEV_WAL : + bdev[BDEV_NEWDB] ? + BDEV_NEWDB : + bdev[BDEV_DB] && !(flags & REMOVE_DB)? + BDEV_DB : + BDEV_SLOW; + + // new logging device in new naming scheme + int new_log_dev_next = + new_log_dev_cur == BDEV_NEWWAL ? + BDEV_WAL : + new_log_dev_cur == BDEV_NEWDB ? + BDEV_DB : + new_log_dev_cur; + + int super_dev = + dev_target == BDEV_NEWDB ? + BDEV_NEWDB : + bdev[BDEV_DB] ? + BDEV_DB : + BDEV_SLOW; + + _rewrite_log_sync( + false, + super_dev, + new_log_dev_cur, + new_log_dev_next, + flags); + return 0; +} + BlueFS::FileRef BlueFS::_get_file(uint64_t ino) { auto p = file_map.find(ino); @@ -1204,7 +1484,8 @@ bool BlueFS::_should_compact_log() return true; } -void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t) +void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t, + int flags) { t->seq = 1; t->uuid = super.uuid; @@ -1214,17 +1495,63 @@ void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t) for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) { interval_set& p = block_all[bdev]; for (interval_set::iterator q = p.begin(); q != p.end(); ++q) { - dout(20) << __func__ << " op_alloc_add " << bdev << " 0x" + auto bdev_new = bdev; + if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) { + continue; + } + if ((flags & REMOVE_DB) && bdev == BDEV_DB) { + continue; + } + if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) { + bdev_new = BDEV_DB; + } + if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) { + bdev_new = BDEV_SLOW; + } + if (bdev == BDEV_NEWDB) { + // REMOVE_DB xor RENAME_DB + ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW)); + ceph_assert(!(flags & RENAME_SLOW2DB)); + bdev_new = BDEV_DB; + } + if (bdev == BDEV_NEWWAL) { + ceph_assert(flags & REMOVE_WAL); + bdev_new = BDEV_WAL; + } + dout(20) << 
__func__ << " op_alloc_add " << bdev_new << " 0x" << std::hex << q.get_start() << "~" << q.get_len() << std::dec << dendl; - t->op_alloc_add(bdev, q.get_start(), q.get_len()); + t->op_alloc_add(bdev_new, q.get_start(), q.get_len()); } } for (auto& p : file_map) { if (p.first == 1) continue; - dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl; ceph_assert(p.first > 1); + + for(auto& e : p.second->fnode.extents) { + auto bdev = e.bdev; + auto bdev_new = bdev; + ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL)); + if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) { + bdev_new = BDEV_DB; + } + if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) { + bdev_new = BDEV_SLOW; + } + if (bdev == BDEV_NEWDB) { + // REMOVE_DB xor RENAME_DB + ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW)); + ceph_assert(!(flags & RENAME_SLOW2DB)); + bdev_new = BDEV_DB; + } + if (bdev == BDEV_NEWWAL) { + ceph_assert(flags & REMOVE_WAL); + bdev_new = BDEV_WAL; + } + e.bdev = bdev_new; + } + dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl; t->op_file_update(p.second->fnode); } for (auto& p : dir_map) { @@ -1241,13 +1568,32 @@ void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t) void BlueFS::_compact_log_sync() { dout(10) << __func__ << dendl; + _rewrite_log_sync(true, + BDEV_DB, + log_writer->file->fnode.prefer_bdev, + log_writer->file->fnode.prefer_bdev, + 0); + logger->inc(l_bluefs_log_compactions); +} + +void BlueFS::_rewrite_log_sync(bool allocate_with_fallback, + int super_dev, + int log_dev, + int log_dev_new, + int flags) +{ File *log_file = log_writer->file.get(); // clear out log (be careful who calls us!!!) 
log_t.clear(); + dout(20) << __func__ << " super_dev:" << super_dev + << " log_dev:" << log_dev + << " log_dev_new:" << log_dev_new + << " flags:" << flags + << dendl; bluefs_transaction_t t; - _compact_log_dump_metadata(&t); + _compact_log_dump_metadata(&t, flags); dout(20) << __func__ << " op_jump_seq " << log_seq << dendl; t.op_jump_seq(log_seq); @@ -1261,9 +1607,22 @@ void BlueFS::_compact_log_sync() mempool::bluefs::vector old_extents; uint64_t old_allocated = 0; + int r; log_file->fnode.swap_extents(old_extents, old_allocated); - int r = _allocate(log_file->fnode.prefer_bdev, need, &log_file->fnode); - ceph_assert(r == 0); + if (allocate_with_fallback) { + r = _allocate(log_dev, need, &log_file->fnode); + ceph_assert(r == 0); + } else { + PExtentVector extents; + r = _allocate_without_fallback(log_dev, + need, + &extents); + ceph_assert(r == 0); + for (auto& p : extents) { + log_file->fnode.append_extent( + bluefs_extent_t(log_dev, p.offset, p.length)); + } + } _close_writer(log_writer); @@ -1282,18 +1641,24 @@ void BlueFS::_compact_log_sync() #endif flush_bdev(); - dout(10) << __func__ << " writing super" << dendl; super.log_fnode = log_file->fnode; + // rename device if needed + if (log_dev != log_dev_new) { + dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl; + for (auto& p : super.log_fnode.extents) { + p.bdev = log_dev_new; + } + } + dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl; + ++super.version; - _write_super(); + _write_super(super_dev); flush_bdev(); dout(10) << __func__ << " release old log extents " << old_extents << dendl; for (auto& r : old_extents) { pending_release[r.bdev].insert(r.offset, r.length); } - - logger->inc(l_bluefs_log_compactions); } /* @@ -1360,7 +1725,7 @@ void BlueFS::_compact_log_async(std::unique_lock& l) bluefs_transaction_t t; //avoid record two times in log_t and _compact_log_dump_metadata. 
log_t.clear(); - _compact_log_dump_metadata(&t); + _compact_log_dump_metadata(&t, 0); // conservative estimate for final encoded size new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2, @@ -1432,7 +1797,7 @@ void BlueFS::_compact_log_async(std::unique_lock& l) dout(10) << __func__ << " writing super" << dendl; super.log_fnode = log_file->fnode; ++super.version; - _write_super(); + _write_super(BDEV_DB); lock.unlock(); flush_bdev(); @@ -1815,8 +2180,7 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow); for (unsigned i = 0; i < MAX_BDEV; ++i) { if (bdev[i]) { - ceph_assert(h->iocv[i]); - if (h->iocv[i]->has_pending_aios()) { + if (h->iocv[i] && h->iocv[i]->has_pending_aios()) { bdev[i]->aio_submit(h->iocv[i]); } } @@ -1982,6 +2346,40 @@ void BlueFS::flush_bdev() } } +int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len, + PExtentVector* extents) +{ + dout(10) << __func__ << " len 0x" << std::hex << len << std::dec + << " from " << (int)id << dendl; + assert(id < alloc.size()); + uint64_t min_alloc_size = cct->_conf->bluefs_alloc_size; + + uint64_t left = round_up_to(len, min_alloc_size); + + if (!alloc[id]) { + return -ENOENT; + } + extents->reserve(4); // 4 should be (more than) enough for most allocations + int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents); + if (alloc_len < (int64_t)left) { + if (alloc_len != 0) { + alloc[id]->release(*extents); + } + if (bdev[id]) + derr << __func__ << " failed to allocate 0x" << std::hex << left + << " on bdev " << (int)id + << ", free 0x" << alloc[id]->get_free() << std::dec << dendl; + else + derr << __func__ << " failed to allocate 0x" << std::hex << left + << " on bdev " << (int)id << ", dne" << std::dec << dendl; + if (alloc[id]) + alloc[id]->dump(); + return -ENOSPC; + } + + return 0; +} + int BlueFS::_allocate(uint8_t id, uint64_t len, bluefs_fnode_t* node) { @@ -2187,9 +2585,10 @@ 
void BlueFS::_close_writer(FileWriter *h) dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl; for (unsigned i=0; i<MAX_BDEV; ++i) { if (bdev[i]) { - ceph_assert(h->iocv[i]); - h->iocv[i]->aio_wait(); - bdev[i]->queue_reap_ioc(h->iocv[i]); + if (h->iocv[i]) { + h->iocv[i]->aio_wait(); + bdev[i]->queue_reap_ioc(h->iocv[i]); + } } } delete h; diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 333c16b54bb..9043634c8c9 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -42,10 +42,12 @@ enum { class BlueFS { public: CephContext* cct; - static constexpr unsigned MAX_BDEV = 3; + static constexpr unsigned MAX_BDEV = 5; static constexpr unsigned BDEV_WAL = 0; static constexpr unsigned BDEV_DB = 1; static constexpr unsigned BDEV_SLOW = 2; + static constexpr unsigned BDEV_NEWWAL = 3; + static constexpr unsigned BDEV_NEWDB = 4; enum { WRITER_UNKNOWN, @@ -273,6 +275,9 @@ private: int _allocate(uint8_t bdev, uint64_t len, bluefs_fnode_t* node); + int _allocate_without_fallback(uint8_t id, uint64_t len, + PExtentVector* extents); + int _flush_range(FileWriter *h, uint64_t offset, uint64_t length); int _flush(FileWriter *h, bool force); int _fsync(FileWriter *h, std::unique_lock& l); @@ -287,10 +292,24 @@ private: uint64_t jump_to = 0); uint64_t _estimate_log_size(); bool _should_compact_log(); - void _compact_log_dump_metadata(bluefs_transaction_t *t); + + enum { + REMOVE_DB = 1, + REMOVE_WAL = 2, + RENAME_SLOW2DB = 4, + RENAME_DB2SLOW = 8, + }; + void _compact_log_dump_metadata(bluefs_transaction_t *t, + int flags); void _compact_log_sync(); void _compact_log_async(std::unique_lock& l); + void _rewrite_log_sync(bool allocate_with_fallback, + int super_dev, + int log_dev, + int new_log_dev, + int flags); + //void _aio_finish(void *priv); void _flush_bdev_safely(FileWriter *h); @@ -316,7 +335,7 @@ private: void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length); int _open_super(); - int _write_super(); + int _write_super(int dev); int _replay(bool noop, 
bool to_stdout = false); ///< replay journal FileWriter *_create_writer(FileRef f); @@ -339,6 +358,7 @@ public: int mkfs(uuid_d osd_uuid); int mount(); void umount(); + int prepare_new_device(int id); int log_dump(); @@ -346,6 +366,15 @@ public: void get_devices(set *ls); int fsck(); + int device_migrate_to_new( + CephContext *cct, + const set& devs_source, + int dev_target); + int device_migrate_to_existing( + CephContext *cct, + const set& devs_source, + int dev_target); + uint64_t get_used(); uint64_t get_total(unsigned id); uint64_t get_free(unsigned id); diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 280fba4a917..5521045833d 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4885,6 +4885,156 @@ bool BlueStore::test_mount_in_use() return ret; } +int BlueStore::_open_bluefs(bool create) +{ + int r; + bluefs = new BlueFS(cct); + + string bfn; + struct stat st; + + bfn = path + "/block.db"; + if (::stat(bfn.c_str(), &st) == 0) { + r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn, + create && cct->_conf->bdev_enable_discard); + if (r < 0) { + derr << __func__ << " add block device(" << bfn << ") returned: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + + if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) { + r = _check_or_set_bdev_label( + bfn, + bluefs->get_block_device_size(BlueFS::BDEV_DB), + "bluefs db", create); + if (r < 0) { + derr << __func__ + << " check block device(" << bfn << ") label returned: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + } + if (create) { + bluefs->add_block_extent( + BlueFS::BDEV_DB, + SUPER_RESERVED, + bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED); + } + bluefs_shared_bdev = BlueFS::BDEV_SLOW; + bluefs_single_shared_device = false; + } else { + r = -errno; + if (::lstat(bfn.c_str(), &st) == -1) { + r = 0; + bluefs_shared_bdev = BlueFS::BDEV_DB; + } else { + derr << __func__ << " " << bfn << " symlink exists but target 
unusable: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + } + + // shared device + bfn = path + "/block"; + // never trim here + r = bluefs->add_block_device(bluefs_shared_bdev, bfn, false); + if (r < 0) { + derr << __func__ << " add block device(" << bfn << ") returned: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + if (create) { + // note: we always leave the first SUPER_RESERVED (8k) of the device unused + uint64_t initial = + bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio + + cct->_conf->bluestore_bluefs_gift_ratio); + initial = std::max(initial, cct->_conf->bluestore_bluefs_min); + if (cct->_conf->bluefs_alloc_size % min_alloc_size) { + derr << __func__ << " bluefs_alloc_size 0x" << std::hex + << cct->_conf->bluefs_alloc_size << " is not a multiple of " + << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl; + r = -EINVAL; + goto free_bluefs; + } + // align to bluefs's alloc_size + initial = p2roundup(initial, cct->_conf->bluefs_alloc_size); + // put bluefs in the middle of the device in case it is an HDD + uint64_t start = p2align((bdev->get_size() - initial) / 2, + cct->_conf->bluefs_alloc_size); + //avoiding superblock overwrite + ceph_assert(cct->_conf->bluefs_alloc_size > _get_ondisk_reserved()); + start = std::max(cct->_conf->bluefs_alloc_size, start); + + bluefs->add_block_extent(bluefs_shared_bdev, start, initial); + bluefs_extents.insert(start, initial); + } + + bfn = path + "/block.wal"; + if (::stat(bfn.c_str(), &st) == 0) { + r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn, + create && cct->_conf->bdev_enable_discard); + if (r < 0) { + derr << __func__ << " add block device(" << bfn << ") returned: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + + if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) { + r = _check_or_set_bdev_label( + bfn, + bluefs->get_block_device_size(BlueFS::BDEV_WAL), + "bluefs wal", create); + if (r < 0) { + derr << __func__ << " check block device(" << bfn + << ") label 
returned: " << cpp_strerror(r) << dendl; + goto free_bluefs; + } + } + + if (create) { + bluefs->add_block_extent( + BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE, + bluefs->get_block_device_size(BlueFS::BDEV_WAL) - + BDEV_LABEL_BLOCK_SIZE); + } + kv_options["separate_wal_dir"] = "1"; + bluefs_single_shared_device = false; + } else { + r = -errno; + if (::lstat(bfn.c_str(), &st) == -1) { + kv_options.erase("separate_wal_dir"); + r = 0; + } else { + derr << __func__ << " " << bfn << " symlink exists but target unusable: " + << cpp_strerror(r) << dendl; + goto free_bluefs; + } + } + + if (create) { + bluefs->mkfs(fsid); + } + r = bluefs->mount(); + if (r < 0) { + derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl; + goto free_bluefs; + } + + return 0; +free_bluefs: + ceph_assert(bluefs); + delete bluefs; + bluefs = NULL; + return r; +} + +void BlueStore::_close_bluefs() +{ + bluefs->umount(); + delete bluefs; + bluefs = NULL; +} int BlueStore::_open_db(bool create, bool to_repair_db) { int r; @@ -4938,138 +5088,9 @@ int BlueStore::_open_db(bool create, bool to_repair_db) derr << " backend must be rocksdb to use bluefs" << dendl; return -EINVAL; } - bluefs = new BlueFS(cct); - - string bfn; - struct stat st; - - bfn = path + "/block.db"; - if (::stat(bfn.c_str(), &st) == 0) { - r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn, - create && cct->_conf->bdev_enable_discard); - if (r < 0) { - derr << __func__ << " add block device(" << bfn << ") returned: " - << cpp_strerror(r) << dendl; - goto free_bluefs; - } - - if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) { - r = _check_or_set_bdev_label( - bfn, - bluefs->get_block_device_size(BlueFS::BDEV_DB), - "bluefs db", create); - if (r < 0) { - derr << __func__ - << " check block device(" << bfn << ") label returned: " - << cpp_strerror(r) << dendl; - goto free_bluefs; - } - } - if (create) { - bluefs->add_block_extent( - BlueFS::BDEV_DB, - SUPER_RESERVED, - bluefs->get_block_device_size(BlueFS::BDEV_DB) - 
SUPER_RESERVED); - } - bluefs_shared_bdev = BlueFS::BDEV_SLOW; - bluefs_single_shared_device = false; - } else { - r = -errno; - if (::lstat(bfn.c_str(), &st) == -1) { - r = 0; - bluefs_shared_bdev = BlueFS::BDEV_DB; - } else { - derr << __func__ << " " << bfn << " symlink exists but target unusable: " - << cpp_strerror(r) << dendl; - goto free_bluefs; - } - } - - // shared device - bfn = path + "/block"; - // never trim here - r = bluefs->add_block_device(bluefs_shared_bdev, bfn, false); - if (r < 0) { - derr << __func__ << " add block device(" << bfn << ") returned: " - << cpp_strerror(r) << dendl; - goto free_bluefs; - } - if (create) { - // note: we always leave the first SUPER_RESERVED (8k) of the device unused - uint64_t initial = - bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio + - cct->_conf->bluestore_bluefs_gift_ratio); - initial = std::max(initial, cct->_conf->bluestore_bluefs_min); - if (cct->_conf->bluefs_alloc_size % min_alloc_size) { - derr << __func__ << " bluefs_alloc_size 0x" << std::hex - << cct->_conf->bluefs_alloc_size << " is not a multiple of " - << "min_alloc_size 0x" << min_alloc_size << std::dec << dendl; - r = -EINVAL; - goto free_bluefs; - } - // align to bluefs's alloc_size - initial = p2roundup(initial, cct->_conf->bluefs_alloc_size); - // put bluefs in the middle of the device in case it is an HDD - uint64_t start = p2align((bdev->get_size() - initial) / 2, - cct->_conf->bluefs_alloc_size); - //avoiding superblock overwrite - ceph_assert(cct->_conf->bluefs_alloc_size > _get_ondisk_reserved()); - start = std::max(cct->_conf->bluefs_alloc_size, start); - - bluefs->add_block_extent(bluefs_shared_bdev, start, initial); - bluefs_extents.insert(start, initial); - } - - bfn = path + "/block.wal"; - if (::stat(bfn.c_str(), &st) == 0) { - r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn, - create && cct->_conf->bdev_enable_discard); - if (r < 0) { - derr << __func__ << " add block device(" << bfn << ") returned: " - << 
cpp_strerror(r) << dendl; - goto free_bluefs; - } - if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) { - r = _check_or_set_bdev_label( - bfn, - bluefs->get_block_device_size(BlueFS::BDEV_WAL), - "bluefs wal", create); - if (r < 0) { - derr << __func__ << " check block device(" << bfn - << ") label returned: " << cpp_strerror(r) << dendl; - goto free_bluefs; - } - } + r = _open_bluefs(create); - if (create) { - bluefs->add_block_extent( - BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE, - bluefs->get_block_device_size(BlueFS::BDEV_WAL) - - BDEV_LABEL_BLOCK_SIZE); - } - kv_options["separate_wal_dir"] = "1"; - bluefs_single_shared_device = false; - } else { - r = -errno; - if (::lstat(bfn.c_str(), &st) == -1) { - kv_options.erase("separate_wal_dir"); - r = 0; - } else { - derr << __func__ << " " << bfn << " symlink exists but target unusable: " - << cpp_strerror(r) << dendl; - goto free_bluefs; - } - } - - if (create) { - bluefs->mkfs(fsid); - } - r = bluefs->mount(); - if (r < 0) { - derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl; - goto free_bluefs; - } if (cct->_conf->bluestore_bluefs_env_mirror) { rocksdb::Env *a = new BlueRocksEnv(bluefs); rocksdb::Env *b = rocksdb::Env::Default(); @@ -5143,9 +5164,7 @@ int BlueStore::_open_db(bool create, bool to_repair_db) if (!db) { derr << __func__ << " error creating db" << dendl; if (bluefs) { - bluefs->umount(); - delete bluefs; - bluefs = NULL; + _close_bluefs(); } // delete env manually here since we can't depend on db to do this // under this case @@ -5194,12 +5213,6 @@ int BlueStore::_open_db(bool create, bool to_repair_db) dout(1) << __func__ << " opened " << kv_backend << " path " << fn << " options " << options << dendl; return 0; - -free_bluefs: - ceph_assert(bluefs); - delete bluefs; - bluefs = NULL; - return r; } void BlueStore::_close_db() @@ -5208,9 +5221,7 @@ void BlueStore::_close_db() delete db; db = NULL; if (bluefs) { - bluefs->umount(); - delete bluefs; - bluefs = NULL; + 
_close_bluefs(); } } @@ -5312,9 +5323,10 @@ int BlueStore::allocate_bluefs_freespace(uint64_t size) << bluefs_extents << std::dec << dendl; synct->set(PREFIX_SUPER, "bluefs_extents", bl); - int r = db->submit_transaction_sync(synct); - assert(r == 0); + synct->set(PREFIX_SUPER, "bluefs_extents_back", bl); + int r = db->submit_transaction_sync(synct); + ceph_assert(r == 0); } return 0; } @@ -5403,13 +5415,13 @@ int BlueStore::_balance_bluefs_freespace(PExtentVector *extents) if (alloc_len <= 0) { dout(0) << __func__ << " no allocate on 0x" << std::hex << gift - << " min_alloc_size 0x" << cct->_conf->bluefs_alloc_size + << " bluefs_alloc_size 0x" << cct->_conf->bluefs_alloc_size << std::dec << dendl; _dump_alloc_on_rebalance_failure(); return 0; } else if (alloc_len < (int64_t)gift) { dout(0) << __func__ << " insufficient allocate on 0x" << std::hex << gift - << " min_alloc_size 0x" << cct->_conf->bluefs_alloc_size + << " bluefs_alloc_size 0x" << cct->_conf->bluefs_alloc_size << " allocated 0x" << alloc_len << std::dec << dendl; _dump_alloc_on_rebalance_failure(); @@ -5796,6 +5808,277 @@ int BlueStore::mkfs() return r; } +int BlueStore::_mount_for_bluefs() +{ + int r = _open_path(); + ceph_assert(r == 0); + r = _open_fsid(false); + ceph_assert(r == 0); + r = _read_fsid(&fsid); + ceph_assert(r == 0); + r = _lock_fsid(); + ceph_assert(r == 0); + r = _open_bluefs(false); + ceph_assert(r == 0); + return r; +} + +void BlueStore::_umount_for_bluefs() +{ + _close_bluefs(); + _close_fsid(); + _close_path(); +} + +int BlueStore::add_new_bluefs_device(int id, const string& dev_path) +{ + dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl; + int r; + ceph_assert(path_fd < 0); + + ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB); + + if (!cct->_conf->bluestore_bluefs) { + derr << __func__ << " bluefs isn't configured, can't add new device " << dendl; + return -EIO; + } + + r = _mount_for_bluefs(); + + int reserved; + if (id == 
BlueFS::BDEV_NEWWAL) {
+    string p = path + "/block.wal";
+    r = _setup_block_symlink_or_file("block.wal", dev_path,
+        cct->_conf->bluestore_block_wal_size,
+        true);
+    ceph_assert(r == 0);
+
+    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
+                                 cct->_conf->bdev_enable_discard);
+    ceph_assert(r == 0);
+
+    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
+      r = _check_or_set_bdev_label(
+        p,
+        bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
+        "bluefs wal",
+        true);
+      ceph_assert(r == 0);
+    }
+
+    reserved = BDEV_LABEL_BLOCK_SIZE;
+  } else if (id == BlueFS::BDEV_NEWDB) {
+    string p = path + "/block.db";
+    r = _setup_block_symlink_or_file("block.db", dev_path,
+        cct->_conf->bluestore_block_db_size,
+        true);
+    ceph_assert(r == 0);
+
+    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
+                                 cct->_conf->bdev_enable_discard);
+    ceph_assert(r == 0);
+
+    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
+      r = _check_or_set_bdev_label(
+        p,
+        bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
+        "bluefs db",
+        true);
+      ceph_assert(r == 0);
+    }
+    reserved = SUPER_RESERVED;
+  }
+
+  bluefs->umount();
+  bluefs->mount();
+
+  bluefs->add_block_extent(
+    id,
+    reserved,
+    bluefs->get_block_device_size(id) - reserved);
+
+  r = bluefs->prepare_new_device(id);
+  ceph_assert(r == 0);
+
+  if (r < 0) {
+    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
+  } else {
+    dout(0) << __func__ << " success" << dendl;
+  }
+
+  _umount_for_bluefs();
+  return r;
+}
+// Migrate BlueFS data from the devs_source volumes to the existing volume id
+// (BDEV_SLOW or BDEV_DB) and unlink the block.db/block.wal entries of the
+// migrated sources; returns 0 or a negative errno value.
+int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source, int id)
+{
+  dout(10) << __func__ << " id:" << id << dendl;
+  ceph_assert(path_fd < 0);
+  ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
+
+  if (!cct->_conf->bluestore_bluefs) {
+    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+    return -EIO;
+  }
+
+  int r = _mount_for_bluefs();
+  ceph_assert(r == 0); // checked the same way as the remount further down
+  // require bluestore_bluefs_min_free to be free at target device!
+  // NOTE(review): get_val's template argument is presumed to be Option::size_t - confirm
+  uint64_t used_space = cct->_conf.get_val<Option::size_t>("bluestore_bluefs_min_free");
+  for(auto src_id : devs_source) {
+    used_space += bluefs->get_total(src_id) - bluefs->get_free(src_id);
+  }
+  uint64_t target_free = bluefs->get_free(id);
+  if (id == BlueFS::BDEV_SLOW && target_free < used_space) {
+    // will need to remount full BlueStore instance to allocate more space
+    _umount_for_bluefs();
+    r = mount();
+    ceph_assert(r == 0);
+    dout(1) << __func__
+            << " Allocating more space at slow device for BlueFS: +"
+            << used_space - target_free << " bytes" << dendl;
+    r = allocate_bluefs_freespace(used_space - target_free);
+    umount();
+    if (r != 0) {
+      derr << __func__
+           << " can't migrate, unable to allocate extra space: "
+           << used_space - target_free << " at target:" << id
+           << dendl;
+      return -ENOSPC; // the full instance was already shut down by umount() above
+    }
+
+    r = _mount_for_bluefs();
+    ceph_assert(r == 0);
+  } else if (target_free < used_space) {
+    derr << __func__ << " can't migrate, free space at target: " << target_free
+         << " is less than required space: " << used_space << dendl;
+    // BlueFS is still mounted here, so leave through the common cleanup
+    r = -ENOSPC;
+    goto shutdown;
+  }
+  r = bluefs->device_migrate_to_existing(cct, devs_source, id);
+  if (r < 0) {
+    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
+    goto shutdown;
+  }
+
+  if (devs_source.count(BlueFS::BDEV_DB)) {
+    r = unlink(string(path + "/block.db").c_str());
+    ceph_assert(r == 0);
+  }
+  if (devs_source.count(BlueFS::BDEV_WAL)) {
+    r = unlink(string(path + "/block.wal").c_str());
+    ceph_assert(r == 0);
+  }
+
+shutdown:
+  _umount_for_bluefs();
+  return r;
+}
+// Attach dev_path as new BlueFS volume id (BDEV_NEWDB or BDEV_NEWWAL), migrate
+// the devs_source volumes to it, drop their block.db/block.wal entries and set
+// dev_path up under the new name; returns 0 or a negative errno value.
+int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
+  int id, const string& dev_path)
+{
+  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
+  int r;
+  ceph_assert(path_fd < 0);
+  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
+
+  if (!cct->_conf->bluestore_bluefs) {
+    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+    return -EIO;
+  }
+
+  r = _mount_for_bluefs();
+  ceph_assert(r == 0); // bluefs is used right below, so the mount must have succeeded
+  int reserved = 0;
+  string link_db;
+  string link_wal;
+  if (devs_source.count(BlueFS::BDEV_DB) &&
+      bluefs_shared_bdev != BlueFS::BDEV_DB) {
+    link_db = path + "/block.db";
+  }
+  if (devs_source.count(BlueFS::BDEV_WAL)) {
+    link_wal = path + "/block.wal";
+  }
+
+  size_t target_size = 0; // assigned by one of the two branches below
+  string target_name;
+  if (id == BlueFS::BDEV_NEWWAL) {
+    target_name = "block.wal";
+    target_size = cct->_conf->bluestore_block_wal_size;
+
+    r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
+                                 cct->_conf->bdev_enable_discard);
+    ceph_assert(r == 0);
+
+    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
+      r = _check_or_set_bdev_label(
+        dev_path,
+        bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
+        "bluefs wal",
+        true);
+      ceph_assert(r == 0);
+    }
+    reserved = BDEV_LABEL_BLOCK_SIZE;
+  } else if (id == BlueFS::BDEV_NEWDB) {
+    target_name = "block.db";
+    target_size = cct->_conf->bluestore_block_db_size;
+
+    r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
+                                 cct->_conf->bdev_enable_discard);
+    ceph_assert(r == 0);
+
+    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
+      r = _check_or_set_bdev_label(
+        dev_path,
+        bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
+        "bluefs db",
+        true);
+      ceph_assert(r == 0);
+    }
+    reserved = SUPER_RESERVED;
+  }
+
+  bluefs->umount();
+  bluefs->mount();
+
+  bluefs->add_block_extent(
+    id, reserved, bluefs->get_block_device_size(id) - reserved);
+
+  r = bluefs->device_migrate_to_new(cct, devs_source, id);
+
+  if (r < 0) {
+    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
+    goto shutdown;
+  }
+
+  if (!link_db.empty()) {
+    r = unlink(link_db.c_str());
+    ceph_assert(r == 0);
+  }
+  if (!link_wal.empty()) {
+    r = unlink(link_wal.c_str());
+    ceph_assert(r == 0);
+  }
+  r = _setup_block_symlink_or_file(
+    target_name,
+    dev_path,
+    target_size,
+    true);
+  ceph_assert(r == 0);
+  dout(0) << __func__ << " success" << dendl;
+
+shutdown:
+  _umount_for_bluefs();
+  return r;
+}
+
 void BlueStore::set_cache_shards(unsigned num)
 {
   dout(10) << __func__ << " " << num << dendl;
@@ -8509,6 +8792,21 @@ int BlueStore::_open_super_meta()
 
   // bluefs alloc
   if (cct->_conf->bluestore_bluefs) {
+    { // NOTE(review): diagnostic only - the decoded value is dropped by the clear() after this scope
+      bluefs_extents.clear();
+      bufferlist bl;
+      db->get(PREFIX_SUPER, "bluefs_extents_back", &bl);
+      auto p = bl.cbegin();
+      try {
+        decode(bluefs_extents, p);
+      }
+      catch (buffer::error& e) {
+        dout(0) << __func__ << " unable to read bluefs_extents_back" << dendl;
+        //return -EIO;
+      }
+      dout(10) << __func__ << " bluefs_extents_back 0x" << std::hex << bluefs_extents
+        << std::dec << dendl;
+    }
     bluefs_extents.clear();
     bufferlist bl;
     db->get(PREFIX_SUPER, "bluefs_extents", &bl);
@@ -8522,6 +8820,7 @@ int BlueStore::_open_super_meta()
     }
     dout(10) << __func__ << " bluefs_extents 0x" << std::hex << bluefs_extents
              << std::dec << dendl;
+
   }
 
   // ondisk format
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index 098311f32ce..e2d48a95120 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -2123,6 +2123,15 @@ private:
   // its initialization (and outside of _open_bdev)
   void _validate_bdev();
   void _close_bdev();
+
+  int _open_bluefs(bool create);
+  void _close_bluefs();
+
+  // Limited (u)mount intended for BlueFS operations only
+  int _mount_for_bluefs();
+  void _umount_for_bluefs();
+
+
   /*
    * @warning to_repair_db means that we open this db to repair it, will not
    * hold the rocksdb's file lock.
@@ -2356,6 +2365,15 @@ public:
     f->close_section();
   }
 
+  // Offline BlueFS volume management used by ceph-bluestore-tool; the id
+  // arguments are BlueFS::BDEV_* values. Each call returns 0 on success.
+  int add_new_bluefs_device(int id, const string& path);
+  int migrate_to_existing_bluefs_device(const set<int>& devs_source,
+    int id);
+  int migrate_to_new_bluefs_device(const set<int>& devs_source,
+    int id,
+    const string& path);
+
 public:
   int statfs(struct store_statfs_t *buf) override;
 
diff --git a/src/os/bluestore/bluestore_tool.cc b/src/os/bluestore/bluestore_tool.cc
index 0c4ed2fcbae..601b72879e3 100644
--- a/src/os/bluestore/bluestore_tool.cc
+++ b/src/os/bluestore/bluestore_tool.cc
@@ -88,47 +88,71 @@ const char* find_device_path(
   return nullptr;
 }
 
-void add_devices(
-  BlueFS *fs,
+// Read the bdev label of every device in devs and map its path to a BlueFS
+// slot in *got: "bluefs db" -> BDEV_DB, "bluefs wal" -> BDEV_WAL, "main" ->
+// BDEV_SLOW if a DB device was seen and BDEV_DB otherwise. has_db/has_wal
+// are optional outputs; the process exits when a label can not be read.
+void parse_devices(
   CephContext *cct,
-  const vector<string>& devs)
+  const vector<string>& devs,
+  map<string, int>* got,
+  bool* has_db,
+  bool* has_wal)
 {
   string main;
-  set<int> got;
-  for (auto& i : devs) {
+  bool was_db = false;
+  if (has_wal) {
+    *has_wal = false;
+  }
+  if (has_db) {
+    *has_db = false;
+  }
+  for (auto& d : devs) {
     bluestore_bdev_label_t label;
-    int r = BlueStore::_read_bdev_label(cct, i, &label);
+    int r = BlueStore::_read_bdev_label(cct, d, &label);
     if (r < 0) {
-      cerr << "unable to read label for " << i << ": "
+      cerr << "unable to read label for " << d << ": "
            << cpp_strerror(r) << std::endl;
       exit(EXIT_FAILURE);
     }
     int id = -1;
     if (label.description == "main")
-      main = i;
-    else if (label.description == "bluefs db")
+      main = d;
+    else if (label.description == "bluefs db") {
       id = BlueFS::BDEV_DB;
-    else if (label.description == "bluefs wal")
+      was_db = true;
+      if (has_db) {
+        *has_db = true;
+      }
+    }
+    else if (label.description == "bluefs wal") {
       id = BlueFS::BDEV_WAL;
-    if (id >= 0) {
-      got.insert(id);
-      cout << " slot " << id << " " << i << std::endl;
-      int r = fs->add_block_device(id, i, false);
-      if (r < 0) {
-        cerr << "unable to open " << i << ": " << cpp_strerror(r) << std::endl;
-        exit(EXIT_FAILURE);
+      if (has_wal) {
+        *has_wal = true;
       }
     }
+    if (id >= 0) {
+      got->emplace(d, id);
+    }
   }
   if (main.length()) {
-    int id = BlueFS::BDEV_DB;
-    if (got.count(BlueFS::BDEV_DB))
-      id = BlueFS::BDEV_SLOW;
-    cout << " slot " << id << " " << main << std::endl;
-    int r = fs->add_block_device(id, main, false);
+    int id = was_db ? BlueFS::BDEV_SLOW : BlueFS::BDEV_DB;
+    got->emplace(main, id);
+  }
+}
+
+void add_devices(
+  BlueFS *fs,
+  CephContext *cct,
+  const vector<string>& devs)
+{
+  map<string, int> got;
+  parse_devices(cct, devs, &got, nullptr, nullptr);
+  for (const auto& e : got) {
+    cout << " slot " << e.second << " " << e.first << std::endl;
+    int r = fs->add_block_device(e.second, e.first, false);
     if (r < 0) {
-      cerr << "unable to open " << main << ": " << cpp_strerror(r)
-           << std::endl;
+      cerr << "unable to open " << e.first << ": " << cpp_strerror(r) << std::endl;
       exit(EXIT_FAILURE);
     }
   }
@@ -185,6 +209,8 @@ int main(int argc, char **argv)
 {
   string out_dir;
   vector<string> devs;
+  vector<string> devs_source;
+  string dev_target;
   string path;
   string action;
   string log_file;
@@ -199,13 +225,15 @@ int main(int argc, char **argv)
     ("log-file,l", po::value<string>(&log_file), "log file")
     ("log-level", po::value<int>(&log_level), "log level (30=most, 20=lots, 10=some, 1=little)")
     ("dev", po::value<vector<string>>(&devs), "device(s)")
+    ("devs-source", po::value<vector<string>>(&devs_source), "bluefs-bdev-migrate source device(s)")
+    ("dev-target", po::value<string>(&dev_target), "target/resulting device")
     ("deep", po::value<bool>(&fsck_deep), "deep fsck (read all data)")
     ("key,k", po::value<string>(&key), "label metadata key name")
     ("value,v", po::value<string>(&value), "label metadata value")
     ;
   po::options_description po_positional("Positional options");
   po_positional.add_options()
-    ("command", po::value<string>(&action), "fsck, repair, bluefs-export, bluefs-bdev-sizes, bluefs-bdev-expand, show-label, set-label-key, rm-label-key, prime-osd-dir, bluefs-log-dump")
+    ("command", po::value<string>(&action), "fsck, repair, bluefs-export, bluefs-bdev-sizes, bluefs-bdev-expand, bluefs-bdev-new-db, bluefs-bdev-new-wal, bluefs-bdev-migrate, show-label, set-label-key, rm-label-key, prime-osd-dir, bluefs-log-dump")
     ;
   po::options_description po_all("All options");
   po_all.add(po_options).add(po_positional);
@@ -292,6 +320,31 @@ int main(int argc, char **argv)
     }
     inferring_bluefs_devices(devs, path);
   }
+  if (action == "bluefs-bdev-new-db" || action == "bluefs-bdev-new-wal") {
+    if (path.empty()) {
+      cerr << "must specify bluestore path" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (dev_target.empty()) {
+      cout << "NOTICE: --dev-target option omitted, will allocate as a file" << std::endl;
+    }
+    inferring_bluefs_devices(devs, path);
+  }
+  if (action == "bluefs-bdev-migrate") {
+    if (path.empty()) {
+      cerr << "must specify bluestore path" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    inferring_bluefs_devices(devs, path);
+    if (devs_source.size() == 0) {
+      cerr << "must specify source devices with --devs-source" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (dev_target.empty()) {
+      cerr << "must specify target device with --dev-target" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
 
   vector<const char*> args;
   if (log_file.size()) {
@@ -592,6 +645,160 @@ int main(int argc, char **argv)
     delete fs;
   } else if (action == "bluefs-log-dump") {
     log_dump(cct.get(), path, devs);
+  } else if (action == "bluefs-bdev-new-db" || action == "bluefs-bdev-new-wal") {
+    map<string, int> cur_devs_map;
+    bool need_db = action == "bluefs-bdev-new-db";
+
+    bool has_wal = false;
+    bool has_db = false;
+    parse_devices(cct.get(), devs, &cur_devs_map, &has_db, &has_wal);
+
+    if (has_db && has_wal) {
+      cerr << "can't allocate new device, both WAL and DB exist"
+           << std::endl;
+      exit(EXIT_FAILURE);
+    } else if (need_db && has_db) {
+      cerr << "can't allocate new DB device, already exists"
+           << std::endl;
+      exit(EXIT_FAILURE);
+    } else if (!need_db && has_wal) {
+      cerr << "can't allocate new WAL device, already exists"
+           << std::endl;
+      exit(EXIT_FAILURE);
+    } else {
+      // Create either DB or WAL volume
+      BlueStore bluestore(cct.get(), path);
+
+      char target_path[PATH_MAX] = "";
+      if(!dev_target.empty()) {
+        if (realpath(dev_target.c_str(), target_path) == nullptr) {
+          cerr << "failed to retrieve absolute path for " << dev_target
+               << ": " << cpp_strerror(errno)
+               << std::endl;
+          // an empty target_path would be handled like an omitted --dev-target
+          exit(EXIT_FAILURE);
+        }
+      }
+      int r = bluestore.add_new_bluefs_device(
+        need_db ? BlueFS::BDEV_NEWDB : BlueFS::BDEV_NEWWAL,
+        target_path);
+      if (r == 0) {
+        cout << (need_db ? "DB" : "WAL") << " device added " << target_path
+             << std::endl;
+      } else {
+        cerr << "failed to add " << (need_db ? "DB" : "WAL") << " device:"
+             << cpp_strerror(r)
+             << std::endl;
+        exit(EXIT_FAILURE);
+      }
+    }
+  } else if (action == "bluefs-bdev-migrate") {
+    map<string, int> cur_devs_map;
+    set<int> src_dev_ids;
+    map<string, int> src_devs;
+
+
+    parse_devices(cct.get(), devs, &cur_devs_map, nullptr, nullptr);
+    for (auto& s : devs_source) {
+      auto i = cur_devs_map.find(s);
+      if (i != cur_devs_map.end()) {
+        src_devs.emplace(*i);
+        src_dev_ids.emplace(i->second);
+      } else {
+        cerr << "can't migrate " << s << ", not a valid bluefs volume "
+             << std::endl;
+        exit(EXIT_FAILURE);
+      }
+    }
+
+    auto i = cur_devs_map.find(dev_target);
+
+    if (i != cur_devs_map.end()) {
+      // Migrate to an existing BlueFS volume
+
+      auto dev_target_id = i->second;
+      if (dev_target_id == BlueFS::BDEV_WAL) {
+        // currently we're unable to migrate to WAL device since there is no space
+        // reserved for superblock
+        cerr << "Migrate to WAL device isn't supported." << std::endl;
+        exit(EXIT_FAILURE);
+      }
+
+      BlueStore bluestore(cct.get(), path);
+      int r = bluestore.migrate_to_existing_bluefs_device(
+        src_dev_ids,
+        dev_target_id);
+      if (r == 0) {
+        for (const auto& src : src_devs) {
+          if (src.second != BlueFS::BDEV_SLOW) {
+            cout << " device removed:" << src.second << " " << src.first
+                 << std::endl;
+          }
+        }
+      } else {
+        cerr << "failed to migrate to existing BlueFS device: "
+             << dev_target_id
+             << " " << dev_target << ": "
+             << cpp_strerror(r)
+             << std::endl;
+      }
+      ceph_assert(r == 0);
+    } else {
+      // Migrate to a new BlueFS volume
+      // via creating either DB or WAL volume
+      int dev_target_id;
+      if (src_dev_ids.count(BlueFS::BDEV_DB)) {
+        // if we have DB device in the source list - we create DB device
+        // (and may be remove WAL).
+        dev_target_id = BlueFS::BDEV_NEWDB;
+      } else if (src_dev_ids.count(BlueFS::BDEV_WAL)) {
+        dev_target_id = BlueFS::BDEV_NEWWAL;
+      } else {
+        cerr << "Unable to migrate Slow volume to new location, "
+                "please allocate new DB or WAL with "
+                "bluefs-bdev-new-db(wal) command"
+             << std::endl;
+        exit(EXIT_FAILURE);
+      }
+
+      BlueStore bluestore(cct.get(), path);
+
+      char target_path[PATH_MAX] = "";
+      if(!dev_target.empty()) {
+        if (realpath(dev_target.c_str(), target_path) == nullptr) {
+          cerr << "failed to retrieve absolute path for " << dev_target
+               << ": " << cpp_strerror(errno)
+               << std::endl;
+          // do not start a migration towards an empty target path
+          exit(EXIT_FAILURE);
+        }
+      }
+      bool need_db = dev_target_id == BlueFS::BDEV_NEWDB;
+      int r = bluestore.migrate_to_new_bluefs_device(
+        src_dev_ids,
+        dev_target_id,
+        target_path);
+      if (r == 0) {
+        for (const auto& src : src_devs) {
+          if (src.second != BlueFS::BDEV_SLOW) {
+            cout << " device removed:" << src.second << " " << src.first
+                 << std::endl;
+          }
+        }
+        cout << " device added: "
+             << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_WAL)
+             << " " << target_path
+             << std::endl;
+      } else {
+        cerr << "failed to migrate to new BlueFS device: "
+             << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_WAL)
+             << " " << target_path << ": "
+             << cpp_strerror(r)
+             << std::endl;
+      }
+
+      ceph_assert(r == 0);
+    }
   } else {
     cerr << "unrecognized action " << action << std::endl;
     return 1;