From: Igor Fedotov Date: Fri, 11 Nov 2022 14:31:19 +0000 (+0300) Subject: os/bluestore: introduce a cooldown period for failed BlueFS allocations. X-Git-Tag: v18.1.0~445^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=e52bcc852ce51ab99138420f9069e2f59e1cb706;p=ceph.git os/bluestore: introduce a cooldown period for failed BlueFS allocations. When using bluefs_shared_alloc_size one might get into a long-lasting state in which such large chunks are no longer available and a fallback to the shared device's min alloc size occurs. The introduced cooldown is intended to prevent repetitive allocation attempts with bluefs_shared_alloc_size for a while. The rationale is to eliminate the performance penalty these failing attempts might cause. Signed-off-by: Igor Fedotov --- diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index bfab547af220..29bd39c704f6 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -4025,6 +4025,18 @@ options: desc: Allocation unit size for primary/shared device default: 64_K with_legacy: true +- name: bluefs_failed_shared_alloc_cooldown + type: float + level: advanced + desc: duration(in seconds) untill the next attempt to use + 'bluefs_shared_alloc_size' after facing ENOSPC failure. + long_desc: Cooldown period(in seconds) when BlueFS uses shared/slow device + allocation size instead of "bluefs_shared_alloc_size' one after facing + recoverable (via fallback to smaller chunk size) ENOSPC failure. Intended + primarily to avoid repetitive unsuccessful allocations which might be + expensive. 
+ default: 600 + with_legacy: true - name: bluefs_max_prefetch type: size level: advanced @@ -4181,7 +4193,7 @@ options: - name: bluestore_bluefs_alloc_failure_dump_interval type: float level: advanced - desc: How frequently (in seconds) to dump allocator onBlueFS space allocation failure + desc: How frequently (in seconds) to dump allocator on BlueFS space allocation failure default: 0 with_legacy: true - name: bluestore_spdk_mem diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index c37d28ce751b..adfe9d0800d2 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -1,6 +1,6 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab - +#include <chrono> #include "boost/algorithm/string.hpp" #include "bluestore_common.h" #include "BlueFS.h" @@ -28,6 +28,8 @@ using std::set; using std::string; using std::to_string; using std::vector; +using std::chrono::duration; +using std::chrono::seconds; using ceph::bufferlist; using ceph::decode; @@ -3738,17 +3740,37 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, bool permit_dev_fallback) { dout(10) << __func__ << " len 0x" << std::hex << len - << " alloc unit hint 0x" << alloc_unit - << std::dec << " from " << (int)id << dendl; + << " au 0x" << alloc_unit + << std::dec << " from " << (int)id + << " cooldown " << cooldown_deadline + << dendl; ceph_assert(id < alloc.size()); int64_t alloc_len = 0; PExtentVector extents; uint64_t hint = 0; int64_t need = len; + bool shared = is_shared_alloc(id); + auto shared_unit = shared_alloc ? shared_alloc->alloc_unit : 0; + bool was_cooldown = false; if (alloc[id]) { if (!alloc_unit) { alloc_unit = alloc_size[id]; } + // do not attempt shared_allocator with bluefs alloc unit + // when cooling down, fallback to slow dev alloc unit. 
+ if (shared && alloc_unit != shared_unit) { + if (duration_cast<seconds>(real_clock::now().time_since_epoch()).count() < + cooldown_deadline) { + logger->inc(l_bluefs_alloc_shared_size_fallbacks); + alloc_unit = shared_unit; + was_cooldown = true; + } else if (cooldown_deadline.fetch_and(0)) { + // we might get false cooldown_deadline reset at this point + // but that's mostly harmless. + dout(1) << __func__ << " shared allocation cooldown period elapsed" + << dendl; + } + } need = round_up_to(len, alloc_unit); if (!node->extents.empty() && node->extents.back().bdev == id) { hint = node->extents.back().end(); @@ -3762,6 +3784,14 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, if (alloc_len > 0) { alloc[id]->release(extents); } + if (!was_cooldown && shared) { + auto delay_s = cct->_conf->bluefs_failed_shared_alloc_cooldown; + cooldown_deadline = delay_s + + duration_cast<seconds>(real_clock::now().time_since_epoch()).count(); + dout(1) << __func__ << " shared allocation cooldown set for " + << delay_s << "s" + << dendl; + } dout(1) << __func__ << " unable to allocate 0x" << std::hex << need << " on bdev " << (int)id << ", allocator name " << alloc[id]->get_name() @@ -3778,8 +3808,8 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, << " unable to allocate 0x" << std::hex << need << " on bdev " << (int)id << std::dec << dendl; } - if (alloc[id] && is_shared_alloc(id) && alloc_unit != shared_alloc->alloc_unit) { - alloc_unit = shared_alloc->alloc_unit; + if (alloc[id] && shared && alloc_unit != shared_unit) { + alloc_unit = shared_unit; dout(20) << __func__ << " fallback to bdev " << (int)id << " with alloc unit 0x" << std::hex << alloc_unit @@ -3815,7 +3845,7 @@ int BlueFS::_allocate(uint8_t id, uint64_t len, logger->set(max_bytes_pcounters[id], used); max_bytes[id] = used; } - if (is_shared_alloc(id)) { + if (shared) { shared_alloc->bluefs_used += alloc_len; } } diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 0f3a1729f199..1b4cef63e444 100644 --- 
a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -399,6 +399,7 @@ private: inline bool is_shared_alloc(unsigned id) const { return id == shared_alloc_id; } + std::atomic<int64_t> cooldown_deadline = 0; class SocketHook; SocketHook* asok_hook = nullptr;