From 9b1559b8dc25cf864626a7db5b25640292db2536 Mon Sep 17 00:00:00 2001 From: Varada Kari Date: Fri, 22 Jul 2016 16:22:33 +0530 Subject: [PATCH] os/bluestore/BlueFS: Add sync and async compaction Patch contains implementation for sync and async log compaction, defaulted to async compaction. Signed-off-by: Sage Weil Signed-off-by: Varada Kari --- src/common/config_opts.h | 1 + src/os/bluestore/BlueFS.cc | 182 ++++++++++++++++++++++++++++++++++--- src/os/bluestore/BlueFS.h | 7 +- 3 files changed, 175 insertions(+), 15 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index cf70130283ecf..d1f4af99bf19c 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -932,6 +932,7 @@ OPTION(bluefs_max_log_runway, OPT_U64, 4194304) // alloc this much at a time OPTION(bluefs_log_compact_min_ratio, OPT_FLOAT, 5.0) // before we consider OPTION(bluefs_log_compact_min_size, OPT_U64, 16*1048576) // before we consider OPTION(bluefs_min_flush_size, OPT_U64, 65536) // ignore flush until its this big +OPTION(bluefs_compact_log_sync, OPT_BOOL, false) // sync or async log compaction? OPTION(bluestore_bluefs, OPT_BOOL, true) OPTION(bluestore_bluefs_env_mirror, OPT_BOOL, false) // mirror to normal Env for debug diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 5fab4550efe23..3d9a787145869 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -973,7 +973,6 @@ uint64_t BlueFS::_estimate_log_size() return ROUND_UP_TO(size, super.block_size); } - bool BlueFS::_should_compact_log() { uint64_t current = log_writer->file->fnode.size; @@ -1023,10 +1022,6 @@ void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t) void BlueFS::_compact_log_sync() { - // FIXME: we currently hold the lock while writing out the compacted log, - // which may mean a latency spike. 
we could drop the lock while writing out - the big compacted log, while continuing to log at the end of the old log - file, and once it's done swap out the old log extents for the new ones. dout(10) << __func__ << dendl; File *log_file = log_writer->file.get(); @@ -1078,6 +1073,153 @@ void BlueFS::_compact_log_sync() logger->inc(l_bluefs_log_compactions); } +/* + * 1. Allocate a new extent to continue the log, and then log an event + * that jumps the log write position to the new extent. At this point, the + * old extent(s) won't be written to, and reflect everything to compact. + * New events will be written to the new region that we'll keep. + * + * 2. While still holding the lock, encode a bufferlist that dumps all of the + * in-memory fnodes and names. This will become the new beginning of the + * log. The last event will jump to the log continuation extent from #1. + * + * 3. Queue a write to a new extent for the new beginning of the log. + * + * 4. Drop lock and wait + * + * 5. Retake the lock. + * + * 6. Update the log_fnode to splice in the new beginning. + * + * 7. Write the new superblock. + * + * 8. Release the old log space. Clean up. + */ +void BlueFS::_compact_log_async(std::unique_lock& l) +{ + dout(10) << __func__ << dendl; + File *log_file = log_writer->file.get(); + + // 1. allocate new log space and jump to it. 
+ old_log_jump_to = log_file->fnode.get_allocated(); + uint64_t need = old_log_jump_to + g_conf->bluefs_max_log_runway; + dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to + << " need 0x" << need << std::dec << dendl; + while (log_file->fnode.get_allocated() < need) { + int r = _allocate(log_file->fnode.prefer_bdev, + g_conf->bluefs_max_log_runway, + &log_file->fnode.extents); + assert(r == 0); + } + dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl; + + // update the log file change and log a jump to the offset where we want to + // write the new entries + log_t.op_file_update(log_file->fnode); + log_t.op_jump(log_seq, old_log_jump_to); + _flush_and_sync_log(l, 0, old_log_jump_to); + + // 2. prepare compacted log + bluefs_transaction_t t; + _compact_log_dump_metadata(&t); + + bufferlist bl; + ::encode(t, bl); + _pad_bl(bl); + + new_log_jump_to = ROUND_UP_TO(bl.length() + super.block_size, + g_conf->bluefs_alloc_size); + bluefs_transaction_t t2; + t2.op_jump(log_seq, new_log_jump_to); + ::encode(t2, bl); + _pad_bl(bl); + dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to + << std::dec << dendl; + + // create a new log [writer] + new_log = new File; + new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode + int r = _allocate(BlueFS::BDEV_DB, new_log_jump_to, + &new_log->fnode.extents); + assert(r == 0); + new_log_writer = _create_writer(new_log); + new_log_writer->append(bl); + + // 3. flush + _flush(new_log_writer, true); + lock.unlock(); + + // 4. wait + dout(10) << __func__ << " waiting for compacted log to sync" << dendl; + wait_for_aio(new_log_writer); + flush_bdev(); + + // 5. retake lock + lock.lock(); + + // 6. 
update our log fnode + // discard first old_log_jump_to extents + dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec + << " of " << log_file->fnode.extents << dendl; + uint64_t discarded = 0; + vector old_extents; + while (discarded < old_log_jump_to) { + bluefs_extent_t& e = log_file->fnode.extents.front(); + bluefs_extent_t temp = e; + if (discarded + e.length <= old_log_jump_to) { + dout(10) << __func__ << " remove old log extent " << e << dendl; + discarded += e.length; + log_file->fnode.extents.erase(log_file->fnode.extents.begin()); + } else { + dout(10) << __func__ << " remove front of old log extent " << e << dendl; + uint64_t drop = old_log_jump_to - discarded; + temp.length = drop; + e.offset += drop; + e.length -= drop; + discarded += drop; + dout(10) << __func__ << " kept " << e << " removed " << temp << dendl; + } + old_extents.push_back(temp); + } + new_log->fnode.extents.insert(new_log->fnode.extents.end(), + log_file->fnode.extents.begin(), + log_file->fnode.extents.end()); + + // clear the extents from old log file, they are added to new log + log_file->fnode.extents.clear(); + + // swap the log files. New log file is the log file now. + log_file->fnode.extents.swap(new_log->fnode.extents); + log_writer->pos = log_writer->file->fnode.size = + log_writer->pos - old_log_jump_to + new_log_jump_to; + + // 7. write the super block to reflect the changes + dout(10) << __func__ << " writing super" << dendl; + super.log_fnode = log_file->fnode; + ++super.version; + _write_super(); + + lock.unlock(); + flush_bdev(); + lock.lock(); + + // 8. 
release old space + dout(10) << __func__ << " release old log extents " << old_extents << dendl; + for (auto& r : old_extents) { + alloc[r.bdev]->release(r.offset, r.length); + } + + // delete the new log, remove from the dirty files list + _close_writer(new_log_writer); + dirty_files.erase(dirty_files.iterator_to(*new_log)); + new_log_writer = nullptr; + new_log = nullptr; + log_cond.notify_all(); + + dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl; + logger->inc(l_bluefs_log_compactions); +} + void BlueFS::_pad_bl(bufferlist& bl) { uint64_t partial = bl.length() % super.block_size; @@ -1118,6 +1260,10 @@ int BlueFS::_flush_and_sync_log(std::unique_lock& l, if (runway < g_conf->bluefs_min_log_runway) { dout(10) << __func__ << " allocating more log runway (0x" << std::hex << runway << std::dec << " remaining)" << dendl; + while (new_log_writer) { + dout(10) << __func__ << " waiting for async compaction" << dendl; + log_cond.wait(l); + } int r = _allocate(log_writer->file->fnode.prefer_bdev, g_conf->bluefs_max_log_runway, &log_writer->file->fnode.extents); @@ -1167,19 +1313,18 @@ int BlueFS::_flush_and_sync_log(std::unique_lock& l, File *file = &(*p); assert(file->dirty_seq > 0); if (file->dirty_seq <= log_seq_stable) { - dout(20) << __func__ << " cleaned file " << file->fnode << dendl; - file->dirty_seq = 0; - dirty_files.erase(p++); + dout(20) << __func__ << " cleaned file " << file->fnode << dendl; + file->dirty_seq = 0; + dirty_files.erase(p++); } else { - ++p; + ++p; } } } else { dout(20) << __func__ << " log_seq_stable " << log_seq_stable - << " already > out seq " << seq - << ", we lost a race against another log flush, done" << dendl; + << " already > out seq " << seq + << ", we lost a race against another log flush, done" << dendl; } - _update_logger_stats(); return 0; @@ -1216,8 +1361,11 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) int r = _allocate(h->file->fnode.prefer_bdev, offset + length - 
allocated, &h->file->fnode.extents); - if (r < 0) + if (r < 0) { + derr << __func__ << " allocated: " << allocated << \ + " offset: " << offset << " length: " << length << dendl; return r; + } must_dirty = true; } if (h->file->fnode.size < offset + length) { @@ -1523,9 +1671,15 @@ void BlueFS::sync_metadata() p->commit_finish(); } } + if (_should_compact_log()) { - _compact_log_sync(); + if (g_conf->bluefs_compact_log_sync) { + _compact_log_sync(); + } else { + _compact_log_async(l); + } } + utime_t end = ceph_clock_now(NULL); utime_t dur = end - start; dout(10) << __func__ << " done in " << dur << dendl; diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 0e45f9adf86ad..dd8b5786c37d4 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -207,6 +207,11 @@ private: bool log_flushing = false; ///< true while flushing the log std::condition_variable log_cond; + uint64_t new_log_jump_to = 0; + uint64_t old_log_jump_to = 0; + FileRef new_log = nullptr; + FileWriter *new_log_writer = nullptr; + /* * There are up to 3 block devices: * @@ -245,6 +250,7 @@ private: bool _should_compact_log(); void _compact_log_dump_metadata(bluefs_transaction_t *t); void _compact_log_sync(); + void _compact_log_async(std::unique_lock& l); //void _aio_finish(void *priv); @@ -292,7 +298,6 @@ public: int mkfs(uuid_d osd_uuid); int mount(); void umount(); - void dump_logfile(ostream &out); int fsck(); -- 2.39.5