From: Sage Weil Date: Mon, 11 Jan 2016 19:42:11 +0000 (-0500) Subject: os/bluestore: simplify rebalance_freespace X-Git-Tag: v10.0.3~43^2~8 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=092c5eacd149ce8f351ee057ad1a04750bd136d7;p=ceph.git os/bluestore: simplify rebalance_freespace - simplify tunables (min/max ratio of freespace between bluefs vs bluestore) - add reclaim support Signed-off-by: Sage Weil --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 7b554c94c98..bda1653533b 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -849,10 +849,11 @@ OPTION(bluefs_min_flush_size, OPT_U64, 65536) // ignore flush until its this bi OPTION(bluestore_bluefs, OPT_BOOL, true) OPTION(bluestore_bluefs_env_mirror, OPT_BOOL, false) // mirror to normal Env for debug -OPTION(bluestore_bluefs_min_ratio, OPT_FLOAT, .01) -OPTION(bluestore_bluefs_min_free_ratio, OPT_FLOAT, .1) -OPTION(bluestore_bluefs_max_free_fs_main_ratio, OPT_FLOAT, .8) -OPTION(bluestore_bluefs_min_gift_ratio, OPT_FLOAT, 1) +OPTION(bluestore_bluefs_min, OPT_U64, 1*1024*1024*1024) // 1gb +OPTION(bluestore_bluefs_min_ratio, OPT_FLOAT, .02) // min fs free / total free +OPTION(bluestore_bluefs_max_ratio, OPT_FLOAT, .90) // max fs free / total free +OPTION(bluestore_bluefs_gift_ratio, OPT_FLOAT, .02) // how much to add at a time +OPTION(bluestore_bluefs_reclaim_ratio, OPT_FLOAT, .20) // how much to reclaim at a time OPTION(bluestore_block_path, OPT_STR, "") OPTION(bluestore_block_size, OPT_U64, 10 * 1024*1024*1024) // 10gb for testing OPTION(bluestore_block_db_path, OPT_STR, "") diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 7574ddb6ebe..878bb39e0c0 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -1182,8 +1182,12 @@ int BlueStore::_open_db(bool create) // note: we might waste a 4k block here if block.db is used, but it's // simpler. 
uint64_t initial = - bdev->get_size() * g_conf->bluestore_bluefs_min_ratio; + bdev->get_size() * (g_conf->bluestore_bluefs_min_ratio + + g_conf->bluestore_bluefs_gift_ratio); + initial = MAX(initial, g_conf->bluestore_bluefs_min); + // align to bluefs's alloc_size initial = ROUND_UP_TO(initial, g_conf->bluefs_alloc_size); + initial += g_conf->bluefs_alloc_size - BLUEFS_START; bluefs->add_block_extent(id, BLUEFS_START, initial); bluefs_extents.insert(BLUEFS_START, initial); } @@ -1365,7 +1369,8 @@ int BlueStore::_reconcile_bluefs_freespace() return 0; } -int BlueStore::_balance_bluefs_freespace(vector<bluestore_extent_t> *extents) +int BlueStore::_balance_bluefs_freespace(vector<bluestore_extent_t> *extents, + KeyValueDB::Transaction t) { int ret = 0; assert(bluefs); @@ -1383,73 +1388,97 @@ int BlueStore::_balance_bluefs_freespace(vector<bluestore_extent_t> *extents) uint64_t total = bdev->get_size(); float my_free_ratio = (float)my_free / (float)total; - dout(10) << __func__ << " bluefs " << pretty_si_t(bluefs_free) - << " free of " << pretty_si_t(bluefs_total) - << " free_ratio " << bluefs_free_ratio << dendl; - dout(10) << __func__ << " bluestore " << pretty_si_t(my_free) - << " free of " << pretty_si_t(total) - << " free_ratio " << my_free_ratio << dendl; + uint64_t total_free = bluefs_free + my_free; + + float bluefs_ratio = (float)bluefs_free / (float)total_free; + + dout(10) << __func__ + << " bluefs " << pretty_si_t(bluefs_free) + << " free (" << bluefs_free_ratio + << ") bluestore " << pretty_si_t(my_free) + << " free (" << my_free_ratio + << "), bluefs_ratio " << bluefs_ratio + << dendl; uint64_t gift = 0; - if (bluefs_free_ratio < g_conf->bluestore_bluefs_min_free_ratio && - bluefs_free_ratio < my_free_ratio) { - // give it more - gift = g_conf->bluestore_bluefs_min_free_ratio * bluefs_total; - dout(10) << __func__ << " bluefs_free_ratio " << bluefs_free_ratio - << " < min_free_ratio " << g_conf->bluestore_bluefs_min_free_ratio - << ", should gift " << pretty_si_t(gift) << dendl; - } - float bluefs_ratio = 
(float)bluefs_total / (float)total; + uint64_t reclaim = 0; if (bluefs_ratio < g_conf->bluestore_bluefs_min_ratio) { - uint64_t g = total * g_conf->bluestore_bluefs_min_ratio; + gift = g_conf->bluestore_bluefs_gift_ratio * total_free; dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio << " < min_ratio " << g_conf->bluestore_bluefs_min_ratio + << ", should gift " << pretty_si_t(gift) << dendl; + } else if (bluefs_ratio > g_conf->bluestore_bluefs_max_ratio) { + reclaim = g_conf->bluestore_bluefs_reclaim_ratio * total_free; + if (bluefs_total - reclaim < g_conf->bluestore_bluefs_min) + reclaim = bluefs_total - g_conf->bluestore_bluefs_min; + dout(10) << __func__ << " bluefs_ratio " << bluefs_ratio + << " > max_ratio " << g_conf->bluestore_bluefs_max_ratio + << ", should reclaim " << pretty_si_t(reclaim) << dendl; + } + if (bluefs_total < g_conf->bluestore_bluefs_min) { + uint64_t g = g_conf->bluestore_bluefs_min; + dout(10) << __func__ << " bluefs_total " << bluefs_total + << " < min " << g_conf->bluestore_bluefs_min << ", should gift " << pretty_si_t(g) << dendl; if (g > gift) gift = g; + reclaim = 0; } - float fs_main_ratio = (float)bluefs_free / (float)my_free; - dout(10) << __func__ << " fs:main free ratio " << fs_main_ratio << dendl; - if (gift) { - float gift_ratio = (float)gift / (float)bluefs_free; - if (gift_ratio < g_conf->bluestore_bluefs_min_gift_ratio) { - dout(10) << __func__ << " proposed gift of " << pretty_si_t(gift) - << " gift_ratio " << gift_ratio - << " < min_gift_ratio " << g_conf->bluestore_bluefs_min_gift_ratio - << dendl; - } else { - // round up to alloc size - uint64_t min_alloc_size = g_conf->bluestore_min_alloc_size; - gift = ROUND_UP_TO(gift, min_alloc_size); + // round up to alloc size + uint64_t min_alloc_size = g_conf->bluestore_min_alloc_size; + gift = ROUND_UP_TO(gift, min_alloc_size); - // hard cap to fit into 32 bits - gift = MIN(gift, 1ull<<31); - dout(10) << __func__ << " gifting " << gift + // hard cap to fit into 32 bits 
+ gift = MIN(gift, 1ull<<31); + dout(10) << __func__ << " gifting " << gift << " (" << pretty_si_t(gift) << ")" << dendl; - // fixme: just do one allocation to start... - int r = alloc->reserve(gift); - assert(r == 0); - - bluestore_extent_t e; - r = alloc->allocate(gift, min_alloc_size, 0, &e.offset, &e.length); - if (r < 0) { - assert(0 == "allocate failed, wtf"); - return r; - } - if (e.length < gift) { - alloc->unreserve(gift - e.length); - } + // fixme: just do one allocation to start... + int r = alloc->reserve(gift); + assert(r == 0); - dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl; - extents->push_back(e); - ret = 1; + bluestore_extent_t e; + r = alloc->allocate(gift, min_alloc_size, 0, &e.offset, &e.length); + if (r < 0) { + assert(0 == "allocate failed, wtf"); + return r; } + if (e.length < gift) { + alloc->unreserve(gift - e.length); + } + + dout(1) << __func__ << " gifting " << e << " to bluefs" << dendl; + extents->push_back(e); + ret = 1; } - // FIXME: reclaim from bluefs? + // reclaim from bluefs? + if (reclaim) { + // round up to alloc size + uint64_t min_alloc_size = g_conf->bluestore_min_alloc_size; + reclaim = ROUND_UP_TO(reclaim, min_alloc_size); + + // hard cap to fit into 32 bits + reclaim = MIN(reclaim, 1ull<<31); + dout(10) << __func__ << " reclaiming " << reclaim + << " (" << pretty_si_t(reclaim) << ")" << dendl; + + uint64_t offset = 0; + uint32_t length = 0; + + // NOTE: this will block and do IO. 
+ int r = bluefs->reclaim_blocks(bluefs_shared_bdev, reclaim, + &offset, &length); + assert(r >= 0); + + bluefs_extents.erase(offset, length); + + fm->release(offset, length, t); + alloc->release(offset, length); + ret = 1; + } return ret; } @@ -3525,7 +3554,7 @@ void BlueStore::_kv_sync_thread() vector<bluestore_extent_t> bluefs_gift_extents; if (bluefs) { - int r = _balance_bluefs_freespace(&bluefs_gift_extents); + int r = _balance_bluefs_freespace(&bluefs_gift_extents, t); assert(r >= 0); if (r > 0) { for (auto& p : bluefs_gift_extents) { diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 3a5ddaf5fee..cfabfe6f543 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -530,7 +530,8 @@ private: int _open_super_meta(); int _reconcile_bluefs_freespace(); - int _balance_bluefs_freespace(vector<bluestore_extent_t> *extents); + int _balance_bluefs_freespace(vector<bluestore_extent_t> *extents, + KeyValueDB::Transaction t); void _commit_bluefs_freespace(const vector<bluestore_extent_t>& extents); CollectionRef _get_collection(coll_t cid);