From 012c5c39e9a89290e4b5f573bf14190140c141ec Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Thu, 4 Jul 2019 17:19:36 +0300 Subject: [PATCH] os/bluestore: introduce volume selector abstraction to BlueFS Signed-off-by: Igor Fedotov --- src/os/bluestore/BlueFS.cc | 160 +++++++++++++++++++++++-------- src/os/bluestore/BlueFS.h | 58 ++++++++++- src/os/bluestore/bluefs_types.cc | 4 +- src/os/bluestore/bluefs_types.h | 7 +- 4 files changed, 182 insertions(+), 47 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 39291dd6f7edb..a8dee6fccc760 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -121,6 +121,8 @@ BlueFS::BlueFS(CephContext* cct) discard_cb[BDEV_DB] = db_discard_cb; discard_cb[BDEV_SLOW] = slow_discard_cb; asok_hook = SocketHook::create(this); + // set default volume selector + vselector.reset(new OriginalVolumeSelector(this)); } BlueFS::~BlueFS() @@ -394,6 +396,7 @@ void BlueFS::dump_perf_counters(Formatter *f) void BlueFS::dump_block_extents(ostream& out) { + vselector->dump(cct); for (unsigned i = 0; i < MAX_BDEV; ++i) { if (!bdev[i]) { continue; @@ -461,11 +464,12 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout) // init log FileRef log_file = ceph::make_ref(); log_file->fnode.ino = 1; - log_file->fnode.prefer_bdev = BDEV_WAL; + log_file->vselector_hint = vselector->get_hint_by_device(BDEV_WAL); int r = _allocate( - log_file->fnode.prefer_bdev, + vselector->select_prefer_bdev(log_file->vselector_hint), cct->_conf->bluefs_max_log_runway, &log_file->fnode); + vselector->add_usage(log_file->vselector_hint, log_file->fnode); ceph_assert(r == 0); log_writer = _create_writer(log_file); @@ -589,6 +593,7 @@ int BlueFS::mount() _stop_alloc(); goto out; } + vselector->dump(cct); // init freelist for (auto& p : file_map) { @@ -645,6 +650,8 @@ void BlueFS::umount() super = bluefs_super_t(); log_t.clear(); _shutdown_logger(); + + vselector->dump(cct); } int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout) @@ -813,6 +820,8 @@ int BlueFS::_replay(bool noop, bool to_stdout) if (!noop) { log_file->fnode = super.log_fnode; + log_file->vselector_hint = + vselector->get_hint_by_device(BDEV_WAL); } else { // do not use fnode from superblock in 'noop' mode - log_file's one should // be fine and up-to-date @@ -1141,6 +1150,12 @@ int BlueFS::_replay(bool noop, bool to_stdout) ceph_assert(q != dir_map.end()); map::iterator r = q->second->file_map.find(filename); ceph_assert(r == q->second->file_map.end()); + + vselector->sub_usage(file->vselector_hint, file->fnode); + file->vselector_hint = + vselector->get_hint_by_dir(dirname); + vselector->add_usage(file->vselector_hint, file->fnode); + q->second->file_map[filename] = file; ++file->refs; } @@ -1217,7 +1232,7 @@ int BlueFS::_replay(bool noop, bool to_stdout) bluefs_fnode_t fnode; decode(fnode, p); dout(20) << __func__ << " 0x" << std::hex << pos << std::dec - << ": op_file_update " << " " << fnode << dendl; + << ": op_file_update " << " " << fnode << " " << dendl; if (unlikely(to_stdout)) { std::cout << " 0x" << std::hex << pos << std::dec << ": op_file_update " << " " << fnode << std::endl; @@ -1225,6 +1240,7 @@ int BlueFS::_replay(bool noop, bool to_stdout) if (!noop) { FileRef f = _get_file(fnode.ino); +<<<<<<< HEAD if (cct->_conf->bluefs_log_replay_check_allocations) { // check initial log layout if (first_log_check) { @@ -1249,6 +1265,16 @@ int BlueFS::_replay(bool noop, bool to_stdout) } f->fnode = fnode; +======= + if (fnode.ino != 1) { + vselector->sub_usage(f->vselector_hint, f->fnode); + } + f->fnode = fnode; + if (fnode.ino != 1) { + vselector->add_usage(f->vselector_hint, f->fnode); + } + +>>>>>>> os/bluestore: introduce volume selector abstraction to BlueFS if (fnode.ino > ino_last) { ino_last = fnode.ino; } @@ -1277,6 +1303,7 @@ int BlueFS::_replay(bool noop, bool to_stdout) if (!noop) { auto p = file_map.find(ino); ceph_assert(p != file_map.end()); + vselector->sub_usage(p->second->vselector_hint, p->second->fnode); if (cct->_conf->bluefs_log_replay_check_allocations) { auto& fnode_extents = p->second->fnode.extents; for (auto e : fnode_extents) { @@ -1334,6 +1361,7 @@ int BlueFS::_replay(bool noop, bool to_stdout) ++log_seq; log_file->fnode.size = log_reader->buf.pos; } + vselector->add_usage(log_file->vselector_hint, log_file->fnode); if (!noop && first_log_check && cct->_conf->bluefs_log_replay_check_allocations) { @@ -1487,12 +1515,6 @@ int BlueFS::device_migrate_to_existing( } } } - auto& prefer_bdev = file_ref->fnode.prefer_bdev; - if (prefer_bdev != dev_target && devs_source.count(prefer_bdev)) { - dout(20) << __func__ << " " << " ... adjusting prefer_bdev " - << prefer_bdev << " -> " << dev_target_new << dendl; - prefer_bdev = dev_target_new; - } } // new logging device in the current naming scheme int new_log_dev_cur = bdev[BDEV_WAL] ? @@ -1546,7 +1568,7 @@ int BlueFS::device_migrate_to_new( (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) : 0; flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0; - int dev_target_new = dev_target; + int dev_target_new = dev_target; //FIXME: remove, makes no sense for (auto& p : file_map) { //do not copy log @@ -1621,12 +1643,6 @@ int BlueFS::device_migrate_to_new( fnode_extents.emplace_back(dev_target_new, i.offset, i.length); } } - auto& prefer_bdev = p.second->fnode.prefer_bdev; - if (prefer_bdev != dev_target && devs_source.count(prefer_bdev)) { - dout(20) << __func__ << " " << " ... adjusting prefer_bdev " - << prefer_bdev << " -> " << dev_target_new << dendl; - prefer_bdev = dev_target_new; - } } // new logging device in the current naming scheme int new_log_dev_cur = @@ -1689,6 +1705,7 @@ void BlueFS::_drop_link(FileRef file) if (file->refs == 0) { dout(20) << __func__ << " destroying " << file->fnode << dendl; ceph_assert(file->num_reading.load() == 0); + vselector->sub_usage(file->vselector_hint, file->fnode); log_t.op_file_remove(file->fnode.ino); for (auto& r : file->fnode.extents) { pending_release[r.bdev].insert(r.offset, r.length); @@ -2040,10 +2057,12 @@ void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t, void BlueFS::_compact_log_sync() { dout(10) << __func__ << dendl; + auto prefer_bdev = + vselector->select_prefer_bdev(log_writer->file->vselector_hint); _rewrite_log_and_layout_sync(true, BDEV_DB, - log_writer->file->fnode.prefer_bdev, - log_writer->file->fnode.prefer_bdev, + prefer_bdev, + prefer_bdev, 0, super.memorized_layout); logger->inc(l_bluefs_log_compactions); @@ -2100,6 +2119,9 @@ void BlueFS::_rewrite_log_and_layout_sync(bool allocate_with_fallback, _close_writer(log_writer); log_file->fnode.size = bl.length(); + vselector->sub_usage(log_file->vselector_hint, old_fnode); + vselector->add_usage(log_file->vselector_hint, log_file->fnode); + log_writer = _create_writer(log_file); log_writer->append(bl); r = _flush(log_writer, true); @@ -2177,13 +2199,18 @@ void BlueFS::_compact_log_async(std::unique_lock& l) log_cond.wait(l); } + vselector->sub_usage(log_file->vselector_hint, log_file->fnode); + // 1. allocate new log space and jump to it. old_log_jump_to = log_file->fnode.get_allocated(); dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl; - int r = _allocate(log_file->fnode.prefer_bdev, - cct->_conf->bluefs_max_log_runway, &log_file->fnode); + int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint), + cct->_conf->bluefs_max_log_runway, + &log_file->fnode); ceph_assert(r == 0); + //adjust usage as flush below will need it + vselector->add_usage(log_file->vselector_hint, log_file->fnode); dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl; // update the log file change and log a jump to the offset where we want to @@ -2194,6 +2221,7 @@ void BlueFS::_compact_log_async(std::unique_lock& l) flush_bdev(); // FIXME? _flush_and_sync_log(l, 0, old_log_jump_to); + vselector->sub_usage(log_file->vselector_hint, log_file->fnode); // 2. prepare compacted log bluefs_transaction_t t; @@ -2211,6 +2239,7 @@ void BlueFS::_compact_log_async(std::unique_lock& l) t.op_jump(log_seq, new_log_jump_to); // allocate + //FIXME: check if we want DB here? r = _allocate(BlueFS::BDEV_DB, new_log_jump_to, &new_log->fnode); ceph_assert(r == 0); @@ -2237,6 +2266,7 @@ void BlueFS::_compact_log_async(std::unique_lock& l) // 5. update our log fnode // discard first old_log_jump_to extents + dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec << " of " << log_file->fnode.extents << dendl; uint64_t discarded = 0; @@ -2275,6 +2305,8 @@ void BlueFS::_compact_log_async(std::unique_lock& l) log_writer->pos = log_writer->file->fnode.size = log_writer->pos - old_log_jump_to + new_log_jump_to; + vselector->add_usage(log_file->vselector_hint, log_file->fnode); + // 6. write the super block to reflect the changes dout(10) << __func__ << " writing super" << dendl; super.log_fnode = log_file->fnode; @@ -2376,10 +2408,13 @@ int BlueFS::_flush_and_sync_log(std::unique_lock& l, dout(10) << __func__ << " waiting for async compaction" << dendl; log_cond.wait(l); } - int r = _allocate(log_writer->file->fnode.prefer_bdev, - cct->_conf->bluefs_max_log_runway, - &log_writer->file->fnode); + vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode); + int r = _allocate( + vselector->select_prefer_bdev(log_writer->file->vselector_hint), + cct->_conf->bluefs_max_log_runway, + &log_writer->file->fnode); ceph_assert(r == 0); + vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode); log_t.op_file_update(log_writer->file->fnode); } @@ -2406,7 +2441,9 @@ int BlueFS::_flush_and_sync_log(std::unique_lock& l, dout(10) << __func__ << " jumping log offset from 0x" << std::hex << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl; log_writer->pos = jump_to; + vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size); log_writer->file->fnode.size = jump_to; + vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size); } _flush_bdev_safely(log_writer); @@ -2495,7 +2532,7 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) ceph_assert(offset <= h->file->fnode.size); uint64_t allocated = h->file->fnode.get_allocated(); - + vselector->sub_usage(h->file->vselector_hint, h->file->fnode); // do not bother to dirty the file if we are overwriting // previously allocated extents. bool must_dirty = false; @@ -2504,13 +2541,14 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) // we should never run out of log space here; see the min runway check // in _flush_and_sync_log. ceph_assert(h->file->fnode.ino != 1); - int r = _allocate(h->file->fnode.prefer_bdev, + int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint), offset + length - allocated, &h->file->fnode); if (r < 0) { derr << __func__ << " allocated: 0x" << std::hex << allocated << " offset: 0x" << offset << " length: 0x" << length << std::dec << dendl; + vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo ceph_abort_msg("bluefs enospc"); return r; } @@ -2667,6 +2705,7 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) } } } + vselector->add_usage(h->file->vselector_hint, h->file->fnode); dout(20) << __func__ << " h " << h << " pos now 0x" << std::hex << h->pos << std::dec << dendl; return 0; @@ -2760,7 +2799,9 @@ int BlueFS::_truncate(FileWriter *h, uint64_t offset) ceph_abort_msg("truncate up not supported"); } ceph_assert(h->file->fnode.size >= offset); + vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size); h->file->fnode.size = offset; + vselector->add_usage(h->file->vselector_hint, h->file->fnode.size); log_t.op_file_update(h->file->fnode); return 0; } @@ -2981,7 +3022,12 @@ int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len) uint64_t allocated = f->fnode.get_allocated(); if (off + len > allocated) { uint64_t want = off + len - allocated; - int r = _allocate(f->fnode.prefer_bdev, want, &f->fnode); + vselector->sub_usage(f->vselector_hint, f->fnode); + + int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint), + want, + &f->fnode); + vselector->add_usage(f->vselector_hint, f->fnode); if (r < 0) return r; log_t.op_file_update(f->fnode); @@ -3057,6 +3103,7 @@ int BlueFS::open_for_write( dout(20) << __func__ << " dir " << dirname << " (" << dir << ") file " << filename << " already exists, truncate + overwrite" << dendl; + vselector->sub_usage(file->vselector_hint, file->fnode); file->fnode.size = 0; for (auto& p : file->fnode.extents) { pending_release[p.bdev].insert(p.offset, p.length); @@ -3068,20 +3115,11 @@ int BlueFS::open_for_write( ceph_assert(file->fnode.ino > 1); file->fnode.mtime = ceph_clock_now(); - file->fnode.prefer_bdev = BlueFS::BDEV_DB; - if (dirname.length() > 5) { - // the "db.slow" and "db.wal" directory names are hard-coded at - // match up with bluestore. the slow device is always the second - // one (when a dedicated block.db device is present and used at - // bdev 0). the wal device is always last. - if (boost::algorithm::ends_with(dirname, ".slow")) { - file->fnode.prefer_bdev = BlueFS::BDEV_SLOW; - } else if (boost::algorithm::ends_with(dirname, ".wal")) { - file->fnode.prefer_bdev = BlueFS::BDEV_WAL; - } - } + file->vselector_hint = vselector->get_hint_by_dir(dirname); + dout(20) << __func__ << " mapping " << dirname << "/" << filename - << " to bdev " << (int)file->fnode.prefer_bdev << dendl; + << " vsel_hint " << file->vselector_hint + << dendl; log_t.op_file_update(file->fnode); if (create) @@ -3405,3 +3443,47 @@ void BlueFS::debug_inject_duplicate_gift(unsigned id, } } +// =============================================== +// OriginalVolumeSelector + +void* OriginalVolumeSelector::get_hint_by_device(uint8_t dev) const { + return reinterpret_cast(dev); +} +void* OriginalVolumeSelector::get_hint_by_dir(const string& dirname) const { + uint8_t res = BlueFS::BDEV_DB; + if (dirname.length() > 5) { + // the "db.slow" and "db.wal" directory names are hard-coded at + // match up with bluestore. the slow device is always the second + // one (when a dedicated block.db device is present and used at + // bdev 0). the wal device is always last. + if (boost::algorithm::ends_with(dirname, ".slow")) { + res = BlueFS::BDEV_SLOW; + } + else if (boost::algorithm::ends_with(dirname, ".wal")) { + res = BlueFS::BDEV_WAL; + } + } + return reinterpret_cast(res); +} + +uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint) +{ + return (uint8_t)(reinterpret_cast(hint)); +} + +void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const +{ + // we have both block.db and block; tell rocksdb! + // note: the second (last) size value doesn't really matter + uint64_t db_size = bluefs->get_block_device_size(BlueFS::BDEV_DB); + uint64_t slow_size = bluefs->get_block_device_size(BlueFS::BDEV_SLOW); + res.emplace_back(base, (uint64_t)(db_size * 95 / 100)); + res.emplace_back(base + ".slow", (uint64_t)(slow_size * 95 / 100)); +} + +#undef dout_prefix +#define dout_prefix *_dout << "OriginalVolumeSelector: " + +void OriginalVolumeSelector::dump(CephContext* c) { + ldout(c, 1) << "OriginalVolumeSelector" << dendl; +} diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 8b786684d7132..7ad3eae316d49 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -75,6 +75,50 @@ public: virtual size_t available_freespace(uint64_t alloc_size) = 0; }; +class BlueFSVolumeSelector { +public: + typedef std::vector> paths; + + virtual ~BlueFSVolumeSelector() { + } + virtual void* get_hint_by_device(uint8_t dev) const = 0; + virtual void* get_hint_by_dir(const string& dirname) const = 0; + + virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; + virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0; + virtual void add_usage(void* file_hint, uint64_t fsize) = 0; + virtual void sub_usage(void* file_hint, uint64_t fsize) = 0; + virtual uint8_t select_prefer_bdev(void* hint) = 0; + virtual void get_paths(const std::string& base, paths& res) const = 0; + virtual void dump(CephContext* cct) = 0; +}; +class BlueFS; +class OriginalVolumeSelector : public BlueFSVolumeSelector { + BlueFS* bluefs = nullptr; +public: + OriginalVolumeSelector(BlueFS* _bluefs) : bluefs(_bluefs) {} + + void* get_hint_by_device(uint8_t dev) const override; + void* get_hint_by_dir(const string& dirname) const override; + + void add_usage(void* file_hint, const bluefs_fnode_t& fnode) override { + // do nothing + } + void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) override { + // do nothing + } + void add_usage(void* file_hint, uint64_t fsize) override { + // do nothing + } + void sub_usage(void* file_hint, uint64_t fsize) override { + // do nothing + } + uint8_t select_prefer_bdev(void* hint) override; + void get_paths(const std::string& base, paths& res) const override; + void dump(CephContext* cct) override; + +}; + class BlueFS { public: CephContext* cct; @@ -104,6 +148,8 @@ public: std::atomic_int num_readers, num_writers; std::atomic_int num_reading; + void* vselector_hint = nullptr; + private: FRIEND_MAKE_REF(File); File() @@ -114,7 +160,8 @@ public: deleted(false), num_readers(0), num_writers(0), - num_reading(0) + num_reading(0), + vselector_hint(nullptr) {} ~File() override { ceph_assert(num_readers.load() == 0); @@ -302,6 +349,7 @@ private: BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev BlueFSDeviceExpander* slow_dev_expander = nullptr; + std::unique_ptr vselector; class SocketHook; SocketHook* asok_hook = nullptr; @@ -490,6 +538,14 @@ public: void set_slow_device_expander(BlueFSDeviceExpander* a) { slow_dev_expander = a; } + void set_volume_selector(BlueFSVolumeSelector* s) { + vselector.reset(s); + } + void get_vselector_paths(const std::string& base, + BlueFSVolumeSelector::paths& res) const { + return vselector->get_paths(base, res); + } + int add_block_device(unsigned bdev, const string& path, bool trim, bool shared_with_bluestore=false); bool bdev_support_label(unsigned id); diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc index eb51eca0ac099..4ef55e4b557fc 100644 --- a/src/os/bluestore/bluefs_types.cc +++ b/src/os/bluestore/bluefs_types.cc @@ -146,7 +146,6 @@ void bluefs_fnode_t::dump(Formatter *f) const f->dump_unsigned("ino", ino); f->dump_unsigned("size", size); f->dump_stream("mtime") << mtime; - f->dump_unsigned("prefer_bdev", prefer_bdev); f->open_array_section("extents"); for (auto& p : extents) f->dump_object("extent", p); @@ -161,7 +160,7 @@ void bluefs_fnode_t::generate_test_instances(list& ls) ls.back()->size = 1048576; ls.back()->mtime = utime_t(123,45); ls.back()->extents.push_back(bluefs_extent_t(0, 1048576, 4096)); - ls.back()->prefer_bdev = 1; + ls.back()->__unused__ = 1; } ostream& operator<<(ostream& out, const bluefs_fnode_t& file) @@ -169,7 +168,6 @@ ostream& operator<<(ostream& out, const bluefs_fnode_t& file) return out << "file(ino " << file.ino << " size 0x" << std::hex << file.size << std::dec << " mtime " << file.mtime - << " bdev " << (int)file.prefer_bdev << " allocated " << std::hex << file.allocated << std::dec << " extents " << file.extents << ")"; diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h index d841264cb13ae..1b1a60c59146e 100644 --- a/src/os/bluestore/bluefs_types.h +++ b/src/os/bluestore/bluefs_types.h @@ -35,12 +35,11 @@ WRITE_CLASS_DENC(bluefs_extent_t) ostream& operator<<(ostream& out, const bluefs_extent_t& e); - struct bluefs_fnode_t { uint64_t ino; uint64_t size; utime_t mtime; - uint8_t prefer_bdev; + uint8_t __unused__; // was prefer_bdev mempool::bluefs::vector extents; // precalculated logical offsets for extents vector entries @@ -49,7 +48,7 @@ struct bluefs_fnode_t { uint64_t allocated; - bluefs_fnode_t() : ino(0), size(0), prefer_bdev(0), allocated(0) {} + bluefs_fnode_t() : ino(0), size(0), __unused__(0), allocated(0) {} uint64_t get_allocated() const { return allocated; @@ -83,7 +82,7 @@ struct bluefs_fnode_t { denc_varint(v.ino, p); denc_varint(v.size, p); denc(v.mtime, p); - denc(v.prefer_bdev, p); + denc(v.__unused__, p); denc(v.extents, p); DENC_FINISH(p); } -- 2.39.5