From 60fdfa3987aaafd8c52bf7220a3c587d6f0cc4e9 Mon Sep 17 00:00:00 2001 From: xie xingguo Date: Fri, 16 Jun 2017 22:29:06 +0800 Subject: [PATCH] os/bluestore: refactor BlueStore::_do_write Signed-off-by: xie xingguo --- src/os/bluestore/BlueStore.cc | 234 ++++++++++++++++++++-------------- src/os/bluestore/BlueStore.h | 13 ++ 2 files changed, 150 insertions(+), 97 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 63eaab41a86..357c848d05f 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -9969,68 +9969,41 @@ void BlueStore::_do_write_data( } } -int BlueStore::_do_write( - TransContext *txc, - CollectionRef& c, - OnodeRef o, - uint64_t offset, - uint64_t length, - bufferlist& bl, - uint32_t fadvise_flags) +void BlueStore::_choose_write_options( + CollectionRef& c, + OnodeRef o, + uint32_t fadvise_flags, + WriteContext *wctx) { - int r = 0; - - dout(20) << __func__ - << " " << o->oid - << " 0x" << std::hex << offset << "~" << length - << " - have 0x" << o->onode.size - << " (" << std::dec << o->onode.size << ")" - << " bytes" - << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec - << dendl; - _dump_onode(o); - - if (length == 0) { - return 0; - } - - uint64_t end = offset + length; - bool was_gc = false; - GarbageCollector gc(c->store->cct); - int64_t benefit; - auto dirty_start = offset; - auto dirty_end = offset + length; - - WriteContext wctx, wctx_gc; if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) { dout(20) << __func__ << " will do buffered write" << dendl; - wctx.buffered = true; + wctx->buffered = true; } else if (cct->_conf->bluestore_default_buffered_write && (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) { dout(20) << __func__ << " defaulting to buffered write" << dendl; - wctx.buffered = true; + wctx->buffered = true; } - // FIXME: Using the MAX of the block_size_order and preferred_csum_order - // results in poor small random read performance when data was initially - // written out in large chunks. Reverting to previous behavior for now. - wctx.csum_order = block_size_order; + // apply basic csum block size + wctx->csum_order = block_size_order; // compression parameters unsigned alloc_hints = o->onode.alloc_hint_flags; auto cm = select_option( "compression_mode", - comp_mode.load(), + comp_mode.load(), [&]() { string val; if(c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) { - return boost::optional(Compressor::get_comp_mode_type(val)); + return boost::optional( + Compressor::get_comp_mode_type(val)); } return boost::optional(); } ); - wctx.compress = (cm != Compressor::COMP_NONE) && + + wctx->compress = (cm != Compressor::COMP_NONE) && ((cm == Compressor::COMP_FORCE) || (cm == Compressor::COMP_AGGRESSIVE && (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) || @@ -10039,22 +10012,24 @@ int BlueStore::_do_write( if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) && (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 && - (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE| - CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) && + (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE | + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) && (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) { + dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl; + auto order = min_alloc_size_order.load(); if (o->onode.expected_write_size) { - wctx.csum_order = std::max(order, - (uint8_t)ctz(o->onode.expected_write_size)); + wctx->csum_order = std::max(order, + (uint8_t)ctz(o->onode.expected_write_size)); } else { - wctx.csum_order = order; + wctx->csum_order = order; } - if (wctx.compress) { - wctx.target_blob_size = select_option( + if (wctx->compress) { + wctx->target_blob_size = select_option( "compression_max_blob_size", - comp_max_blob_size.load(), + comp_max_blob_size.load(), [&]() { int val; if(c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) { @@ -10065,10 +10040,10 @@ int BlueStore::_do_write( ); } } else { - if (wctx.compress) { - wctx.target_blob_size = select_option( + if (wctx->compress) { + wctx->target_blob_size = select_option( "compression_min_blob_size", - comp_min_blob_size.load(), + comp_min_blob_size.load(), [&]() { int val; if(c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) { @@ -10079,25 +10054,107 @@ int BlueStore::_do_write( ); } } + uint64_t max_bsize = max_blob_size.load(); - if (wctx.target_blob_size == 0 || wctx.target_blob_size > max_bsize) { - wctx.target_blob_size = max_bsize; + if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) { + wctx->target_blob_size = max_bsize; } + // set the min blob size floor at 2x the min_alloc_size, or else we // won't be able to allocate a smaller extent for the compressed // data. - if (wctx.compress && - wctx.target_blob_size < min_alloc_size * 2) { - wctx.target_blob_size = min_alloc_size * 2; + if (wctx->compress && + wctx->target_blob_size < min_alloc_size * 2) { + wctx->target_blob_size = min_alloc_size * 2; } + + dout(20) << __func__ << " prefer csum_order " << wctx->csum_order + << " target_blob_size 0x" << std::hex << wctx->target_blob_size + << std::dec << dendl; +} + +int BlueStore::_do_gc( + TransContext *txc, + CollectionRef& c, + OnodeRef o, + const GarbageCollector& gc, + const WriteContext& wctx, + uint64_t *dirty_start, + uint64_t *dirty_end) +{ + auto& extents_to_collect = gc.get_extents_to_collect(); + + WriteContext wctx_gc; wctx_gc.fork(wctx); // make a clone for garbage collection - dout(20) << __func__ << " prefer csum_order " << wctx.csum_order - << " target_blob_size 0x" << std::hex << wctx.target_blob_size - << std::dec << dendl; + for (auto it = extents_to_collect.begin(); + it != extents_to_collect.end(); + ++it) { + bufferlist bl; + int r = _do_read(c.get(), o, it->offset, it->length, bl, 0); + assert(r == (int)it->length); + + o->extent_map.fault_range(db, it->offset, it->length); + _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc); + logger->inc(l_bluestore_gc_merged, it->length); + + if (*dirty_start > it->offset) { + *dirty_start = it->offset; + } + + if (*dirty_end < it->offset + it->length) { + *dirty_end = it->offset + it->length; + } + } + + dout(30) << __func__ << " alloc write" << dendl; + int r = _do_alloc_write(txc, c, o, &wctx_gc); + if (r < 0) { + derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r) + << dendl; + return r; + } + + _wctx_finish(txc, c, o, &wctx_gc); + return 0; +} + +int BlueStore::_do_write( + TransContext *txc, + CollectionRef& c, + OnodeRef o, + uint64_t offset, + uint64_t length, + bufferlist& bl, + uint32_t fadvise_flags) +{ + int r = 0; + + dout(20) << __func__ + << " " << o->oid + << " 0x" << std::hex << offset << "~" << length + << " - have 0x" << o->onode.size + << " (" << std::dec << o->onode.size << ")" + << " bytes" + << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec + << dendl; + _dump_onode(o); + + if (length == 0) { + return 0; + } + + uint64_t end = offset + length; + + GarbageCollector gc(c->store->cct); + int64_t benefit; + auto dirty_start = offset; + auto dirty_end = end; + + WriteContext wctx; + _choose_write_options(c, o, fadvise_flags, &wctx); o->extent_map.fault_range(db, offset, length); _do_write_data(txc, c, o, offset, length, bl, &wctx); - r = _do_alloc_write(txc, c, o, &wctx); if (r < 0) { derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r) @@ -10105,54 +10162,37 @@ int BlueStore::_do_write( goto out; } + // NB: _wctx_finish() will empty old_extents + // so we must do gc estimation before that benefit = gc.estimate(offset, - length, - o->extent_map, - wctx.old_extents, - min_alloc_size); + length, + o->extent_map, + wctx.old_extents, + min_alloc_size); _wctx_finish(txc, c, o, &wctx); if (end > o->onode.size) { dout(20) << __func__ << " extending size to 0x" << std::hex << end - << std::dec << dendl; + << std::dec << dendl; o->onode.size = end; } if (benefit >= g_conf->bluestore_gc_enable_total_threshold) { - dout(20) << __func__ << " perform garbage collection, expected benefit = " - << benefit << " AUs" << dendl; - auto& extents_to_collect = gc.get_extents_to_collect(); - for (auto it = extents_to_collect.begin(); - it != extents_to_collect.end(); - ++it) { - bufferlist bl; - int r = _do_read(c.get(), o, it->offset, it->length, bl, 0); - assert(r == (int)it->length); - o->extent_map.fault_range(db, it->offset, it->length); - _do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc); - logger->inc(l_bluestore_gc_merged, it->length); - was_gc = true; - if (dirty_start > it->offset) { - dirty_start = it->offset; - } - if (dirty_end < it->offset + it->length) { - dirty_end = it->offset + it->length; + if (!gc.get_extents_to_collect().empty()) { + dout(20) << __func__ << " perform garbage collection, " + << "expected benefit = " << benefit << " AUs" << dendl; + r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end); + if (r < 0) { + derr << __func__ << " _do_gc failed with " << cpp_strerror(r) + << dendl; + goto out; } } } - if (was_gc) { - dout(30) << __func__ << " alloc write for GC" << dendl; - r = _do_alloc_write(txc, c, o, &wctx_gc); - if (r < 0) { - derr << __func__ << " _do_alloc_write(gc) failed with " << cpp_strerror(r) - << dendl; - goto out; - } - _wctx_finish(txc, c, o, &wctx_gc); - } o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start); o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start); + r = 0; out: @@ -10162,9 +10202,9 @@ int BlueStore::_do_write( int BlueStore::_write(TransContext *txc, CollectionRef& c, OnodeRef& o, - uint64_t offset, size_t length, - bufferlist& bl, - uint32_t fadvise_flags) + uint64_t offset, size_t length, + bufferlist& bl, + uint32_t fadvise_flags) { dout(15) << __func__ << " " << c->cid << " " << o->oid << " 0x" << std::hex << offset << "~" << length << std::dec diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 04c299a9aea..bd73e0486f0 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -2527,6 +2527,19 @@ private: void _pad_zeros(bufferlist *bl, uint64_t *offset, uint64_t chunk_size); + void _choose_write_options(CollectionRef& c, + OnodeRef o, + uint32_t fadvise_flags, + WriteContext *wctx); + + int _do_gc(TransContext *txc, + CollectionRef& c, + OnodeRef o, + const GarbageCollector& gc, + const WriteContext& wctx, + uint64_t *dirty_start, + uint64_t *dirty_end); + int _do_write(TransContext *txc, CollectionRef &c, OnodeRef o, -- 2.39.5