From: Sage Weil Date: Thu, 2 Feb 2017 19:06:05 +0000 (-0500) Subject: os/bluestore: add bluestore_prefer_wal_size[_hdd,_ssd] options X-Git-Tag: v12.0.1~122^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bed114db0a76fb59c6d6ceace3584fd987dd8bbf;p=ceph.git os/bluestore: add bluestore_prefer_wal_size[_hdd,_ssd] options Add an option to prefer a WAL write if the write is below a size threshold, even if we could avoid it. This lets you trade some write-amp (by journaling data to rocksdb) for latency in cases where the WAL device is much faster than the main device. This affects: - writes to new extent locations below min_alloc_size - writes to unallocated space below min_alloc_size - "big" writes above min_alloc_size that are below the prefer_wal_size threshold. Note that it's applied to individual blobs, not the entirety of the write, so if you have a larger write torn into two pieces/blobs that are below the threshold then they will both go through the WAL. Set different defaults for HDD and SSD, since this makes more sense for HDD where seeks are expensive. Add some test cases to exercise the option. 
Signed-off-by: Sage Weil --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 5f01d3f2d01..54afaef70ed 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1047,6 +1047,9 @@ OPTION(bluestore_min_alloc_size, OPT_U32, 0) OPTION(bluestore_min_alloc_size_hdd, OPT_U32, 64*1024) OPTION(bluestore_min_alloc_size_ssd, OPT_U32, 4*1024) OPTION(bluestore_max_alloc_size, OPT_U32, 0) +OPTION(bluestore_prefer_wal_size, OPT_U32, 0) +OPTION(bluestore_prefer_wal_size_hdd, OPT_U32, 32768) +OPTION(bluestore_prefer_wal_size_ssd, OPT_U32, 0) OPTION(bluestore_compression_mode, OPT_STR, "none") // force|aggressive|passive|none OPTION(bluestore_compression_algorithm, OPT_STR, "snappy") OPTION(bluestore_compression_min_blob_size, OPT_U32, 128*1024) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 4bf39ee8c63..f6f97952095 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -3257,6 +3257,8 @@ const char **BlueStore::get_tracked_conf_keys() const "bluestore_compression_algorithm", "bluestore_compression_min_blob_size", "bluestore_compression_max_blob_size", + "bluestore_max_alloc_size", + "bluestore_prefer_wal_size", NULL }; return KEYS; @@ -3274,6 +3276,13 @@ void BlueStore::handle_conf_change(const struct md_config_t *conf, changed.count("bluestore_compression_max_blob_size")) { _set_compression(); } + if (changed.count("bluestore_prefer_wal_size") || + changed.count("bluestore_max_alloc_size")) { + if (bdev) { + // only after startup + _set_alloc_sizes(); + } + } } void BlueStore::_set_compression() @@ -3609,6 +3618,17 @@ void BlueStore::_set_alloc_sizes(void) max_alloc_size = cct->_conf->bluestore_max_alloc_size; + if (cct->_conf->bluestore_prefer_wal_size) { + prefer_wal_size = cct->_conf->bluestore_prefer_wal_size; + } else { + assert(bdev); + if (bdev->is_rotational()) { + prefer_wal_size = cct->_conf->bluestore_prefer_wal_size_hdd; + } else { + prefer_wal_size = 
cct->_conf->bluestore_prefer_wal_size_ssd; + } + } + dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size << std::dec << " order " << min_alloc_size_order << " max_alloc_size 0x" << std::hex << max_alloc_size @@ -8479,12 +8499,26 @@ void BlueStore::_do_write_small( wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); if (!g_conf->bluestore_debug_omit_block_device_write) { - b->get_blob().map_bl( - b_off, padded, - [&](uint64_t offset, bufferlist& t) { - bdev->aio_write(offset, t, - &txc->ioc, wctx->buffered); - }); + if (b_len <= prefer_wal_size) { + dout(20) << __func__ << " defering small 0x" << std::hex + << b_len << std::dec << " unused write via wal" << dendl; + bluestore_wal_op_t *op = _get_wal_op(txc, o); + op->op = bluestore_wal_op_t::OP_WRITE; + b->get_blob().map( + b_off, b_len, + [&](uint64_t offset, uint64_t length) { + op->extents.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + }); + op->data = padded; + } else { + b->get_blob().map_bl( + b_off, padded, + [&](uint64_t offset, bufferlist& t) { + bdev->aio_write(offset, t, + &txc->ioc, wctx->buffered); + }); + } } b->dirty_blob().calc_csum(b_off, padded); dout(20) << __func__ << " lex old " << *ep << dendl; @@ -8626,6 +8660,7 @@ void BlueStore::_do_write_big( int BlueStore::_do_alloc_write( TransContext *txc, CollectionRef coll, + OnodeRef& o, WriteContext *wctx) { dout(20) << __func__ << " txc " << txc @@ -8812,11 +8847,25 @@ int BlueStore::_do_alloc_write( // queue io if (!g_conf->bluestore_debug_omit_block_device_write) { - b->get_blob().map_bl( - b_off, *l, - [&](uint64_t offset, bufferlist& t) { - bdev->aio_write(offset, t, &txc->ioc, false); - }); + if (l->length() <= prefer_wal_size) { + dout(20) << __func__ << " defering small 0x" << std::hex + << l->length() << std::dec << " write via wal" << dendl; + bluestore_wal_op_t *op = _get_wal_op(txc, o); + op->op = bluestore_wal_op_t::OP_WRITE; + b->get_blob().map( + b_off, l->length(), + [&](uint64_t offset, uint64_t 
length) { + op->extents.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + }); + op->data = *l; + } else { + b->get_blob().map_bl( + b_off, *l, + [&](uint64_t offset, bufferlist& t) { + bdev->aio_write(offset, t, &txc->ioc, false); + }); + } } } if (need > 0) { @@ -9085,7 +9134,7 @@ int BlueStore::_do_write( _do_garbage_collection(txc, c, o, offset, length, &wctx); - r = _do_alloc_write(txc, c, &wctx); + r = _do_alloc_write(txc, c, o, &wctx); if (r < 0) { derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r) << dendl; diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 7d3d4f4edf0..b1a070f8f72 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1770,6 +1770,7 @@ private: uint64_t min_alloc_size = 0; ///< minimum allocation unit (power of 2) size_t min_alloc_size_order = 0; ///< bits for min_alloc_size + uint64_t prefer_wal_size = 0; ///< size threshold for forced wal writes uint64_t max_alloc_size = 0; ///< maximum allocation unit (power of 2) @@ -2304,6 +2305,7 @@ private: int _do_alloc_write( TransContext *txc, CollectionRef c, + OnodeRef& o, WriteContext *wctx); void _wctx_finish( TransContext *txc, diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 4751a7751b2..17575025de1 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -4292,6 +4292,25 @@ TEST_P(StoreTestSpecificAUSize, SyntheticMatrixNoCsum) { do_matrix(m, store, doSyntheticTest); } +TEST_P(StoreTestSpecificAUSize, SyntheticMatrixPreferWAL) { + if (string(GetParam()) != "bluestore") + return; + + const char *m[][10] = { + { "bluestore_min_alloc_size", "4096", "65536", 0 }, // to be the first! 
+ { "max_write", "65536", 0 }, + { "max_size", "1048576", 0 }, + { "alignment", "512", 0 }, + { "bluestore_max_blob_size", "262144", 0 }, + { "bluestore_compression_mode", "force", "none", 0}, + { "bluestore_prefer_wal_size", "32768", "0", 0}, + { "bluestore_sync_wal_apply_hdd", "false", 0}, + { "bluestore_sync_wal_apply_ssd", "false", 0}, + { 0 }, + }; + do_matrix(m, store, doSyntheticTest); +} + TEST_P(StoreTest, AttrSynthetic) { ObjectStore::Sequencer osr("test"); MixedGenerator gen(447);