From 32f63579c7d86a054c55a032ab218a8e8886f3da Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Fri, 11 Nov 2022 17:31:19 +0300 Subject: [PATCH] os/bluestore: introduce a cooldown period for failed BlueFS allocations. When using bluefs_shared_alloc_size one might get into a long-lasting state where chunks that large are no longer available and fallback to the shared device min alloc size occurs. The introduced cooldown is intended to prevent repetitive allocation attempts with bluefs_shared_alloc_size for a while. The rationale is to eliminate the performance penalty these failing attempts might cause. Signed-off-by: Igor Fedotov (cherry picked from commit e52bcc852ce51ab99138420f9069e2f59e1cb706) --- src/common/options/global.yaml.in | 14 +++++++++- src/os/bluestore/BlueFS.cc | 43 ++++++++++++++++++++++++++----- src/os/bluestore/BlueFS.h | 1 + 3 files changed, 51 insertions(+), 7 deletions(-) diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index 3bba58829695..43fd6abe1d71 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -4005,6 +4005,18 @@ options: desc: Allocation unit size for primary/shared device default: 64_K with_legacy: true +- name: bluefs_failed_shared_alloc_cooldown + type: float + level: advanced + desc: duration (in seconds) until the next attempt to use + 'bluefs_shared_alloc_size' after facing ENOSPC failure. + long_desc: Cooldown period (in seconds) when BlueFS uses shared/slow device + allocation size instead of 'bluefs_shared_alloc_size' one after facing + recoverable (via fallback to smaller chunk size) ENOSPC failure. Intended + primarily to avoid repetitive unsuccessful allocations which might be + expensive.
+ default: 600 + with_legacy: true - name: bluefs_max_prefetch type: size level: advanced @@ -4161,7 +4173,7 @@ options: - name: bluestore_bluefs_alloc_failure_dump_interval type: float level: advanced - desc: How frequently (in seconds) to dump allocator onBlueFS space allocation failure + desc: How frequently (in seconds) to dump allocator on BlueFS space allocation failure default: 0 with_legacy: true - name: bluestore_spdk_mem diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index bc6ed0260920..b21ecb4fe367 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -1,6 +1,6 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab - +#include #include "boost/algorithm/string.hpp" #include "bluestore_common.h" #include "BlueFS.h" @@ -28,6 +28,9 @@ using std::set; using std::string; using std::to_string; using std::vector; +using std::chrono::duration; +using std::chrono::duration_cast; +using std::chrono::seconds; using ceph::bufferlist; using ceph::decode; @@ -3729,17 +3732,37 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, bool permit_dev_fallback) { dout(10) << __func__ << " len 0x" << std::hex << len - << " alloc unit hint 0x" << alloc_unit - << std::dec << " from " << (int)id << dendl; + << " au 0x" << alloc_unit + << std::dec << " from " << (int)id + << " cooldown " << cooldown_deadline + << dendl; ceph_assert(id < alloc.size()); int64_t alloc_len = 0; PExtentVector extents; uint64_t hint = 0; int64_t need = len; + bool shared = is_shared_alloc(id); + auto shared_unit = shared_alloc ? shared_alloc->alloc_unit : 0; + bool was_cooldown = false; if (alloc[id]) { if (!alloc_unit) { alloc_unit = alloc_size[id]; } + // do not attempt shared_allocator with bluefs alloc unit + // when cooling down, fallback to slow dev alloc unit. 
+ if (shared && alloc_unit != shared_unit) { + if (duration_cast(real_clock::now().time_since_epoch()).count() < + cooldown_deadline) { + logger->inc(l_bluefs_alloc_shared_size_fallbacks); + alloc_unit = shared_unit; + was_cooldown = true; + } else if (cooldown_deadline.fetch_and(0)) { + // we might get false cooldown_deadline reset at this point + // but that's mostly harmless. + dout(1) << __func__ << " shared allocation cooldown period elapsed" + << dendl; + } + } need = round_up_to(len, alloc_unit); if (!node->extents.empty() && node->extents.back().bdev == id) { hint = node->extents.back().end(); @@ -3753,6 +3776,14 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, if (alloc_len > 0) { alloc[id]->release(extents); } + if (!was_cooldown && shared) { + auto delay_s = cct->_conf->bluefs_failed_shared_alloc_cooldown; + cooldown_deadline = delay_s + + duration_cast(real_clock::now().time_since_epoch()).count(); + dout(1) << __func__ << " shared allocation cooldown set for " + << delay_s << "s" + << dendl; + } dout(1) << __func__ << " unable to allocate 0x" << std::hex << need << " on bdev " << (int)id << ", allocator name " << alloc[id]->get_name() @@ -3769,8 +3800,8 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, << " unable to allocate 0x" << std::hex << need << " on bdev " << (int)id << std::dec << dendl; } - if (alloc[id] && is_shared_alloc(id) && alloc_unit != shared_alloc->alloc_unit) { - alloc_unit = shared_alloc->alloc_unit; + if (alloc[id] && shared && alloc_unit != shared_unit) { + alloc_unit = shared_unit; dout(20) << __func__ << " fallback to bdev " << (int)id << " with alloc unit 0x" << std::hex << alloc_unit @@ -3806,7 +3837,7 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, logger->set(max_bytes_pcounters[id], used); max_bytes[id] = used; } - if (is_shared_alloc(id)) { + if (shared) { shared_alloc->bluefs_used += alloc_len; } } diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 283b04d3af4e..736450c5e993 100644 --- 
a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -399,6 +399,7 @@ private: inline bool is_shared_alloc(unsigned id) const { return id == shared_alloc_id; } + std::atomic cooldown_deadline = 0; class SocketHook; SocketHook* asok_hook = nullptr; -- 2.47.3