From 1d1edb19430afe34d8cce155f8d06669cdbbddc2 Mon Sep 17 00:00:00 2001 From: Mark Nelson Date: Sun, 16 Jun 2019 21:38:03 -0400 Subject: [PATCH] os/BlueStore: Split Cache into Onode/Buffer caches Signed-off-by: Mark Nelson --- src/os/bluestore/BlueStore.cc | 957 ++++++++++--------- src/os/bluestore/BlueStore.h | 374 ++------ src/test/objectstore/test_bluestore_types.cc | 69 +- 3 files changed, 683 insertions(+), 717 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 651390b158f93..81239664094f8 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -880,454 +880,534 @@ int64_t BlueStore::GarbageCollector::estimate( return expected_for_release - expected_allocations; } -// Cache +// LruOnodeCacheShard +struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard { + typedef boost::intrusive::list< + BlueStore::Onode, + boost::intrusive::member_hook< + BlueStore::Onode, + boost::intrusive::list_member_hook<>, + &BlueStore::Onode::lru_item> > list_t; + list_t lru; + + explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {} + + void _add(BlueStore::OnodeRef& o, int level) override + { + (level > 0) ? 
lru.push_front(*o) : lru.push_back(*o); + num = lru.size(); + } + void _rm(BlueStore::OnodeRef& o) override + { + lru.erase(lru.iterator_to(*o)); + num = lru.size(); + } + void _touch(BlueStore::OnodeRef& o) override + { + lru.erase(lru.iterator_to(*o)); + lru.push_front(*o); + num = lru.size(); + } + void _trim_to(uint64_t max) override + { + if (max >= lru.size()) { + return; // don't even try + } + uint64_t n = lru.size() - max; -BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type, - PerfCounters *logger) -{ - Cache *c = nullptr; + auto p = lru.end(); + ceph_assert(p != lru.begin()); + --p; + int skipped = 0; + int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned; + while (n > 0) { + BlueStore::Onode *o = &*p; + int refs = o->nref.load(); + if (refs > 1) { + dout(20) << __func__ << " " << o->oid << " has " << refs + << " refs, skipping" << dendl; + if (++skipped >= max_skipped) { + dout(20) << __func__ << " maximum skip pinned reached; stopping with " + << n << " left to trim" << dendl; + break; + } - if (type == "lru") - c = new LRUCache(cct); - else if (type == "2q") - c = new TwoQCache(cct); - else - ceph_abort_msg("unrecognized cache type"); + if (p == lru.begin()) { + break; + } else { + p--; + n--; + continue; + } + } + dout(30) << __func__ << " rm " << o->oid << dendl; + if (p != lru.begin()) { + lru.erase(p--); + } else { + lru.erase(p); + ceph_assert(n == 1); + } + o->get(); // paranoia + o->c->onode_map.remove(o->oid); + o->put(); + --n; + } + num = lru.size(); + } + void add_stats(uint64_t *onodes) override + { + *onodes += num; + } +}; +// OnodeCacheShard +BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create( + CephContext* cct, + string type, + PerfCounters *logger) +{ + BlueStore::OnodeCacheShard *c = nullptr; + // Currently we only implement an LRU cache for onodes + c = new LruOnodeCacheShard(cct); c->logger = logger; return c; } -void BlueStore::Cache::trim_onodes() -{ - std::lock_guard l(lock); - 
_trim_onodes(); -} - -void BlueStore::Cache::trim_buffers() -{ - std::lock_guard l(lock); - _trim_buffers(); -} - -void BlueStore::Cache::flush() -{ - std::lock_guard l(lock); - _trim_buffers_to(0); - _trim_onodes_to(0); -} +// LruBufferCacheShard +struct LruBufferCacheShard : public BlueStore::BufferCacheShard { + typedef boost::intrusive::list< + BlueStore::Buffer, + boost::intrusive::member_hook< + BlueStore::Buffer, + boost::intrusive::list_member_hook<>, + &BlueStore::Buffer::lru_item> > list_t; + list_t lru; + + explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {} + + void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override { + if (near) { + auto q = lru.iterator_to(*near); + lru.insert(q, *b); + } else if (level > 0) { + lru.push_front(*b); + } else { + lru.push_back(*b); + } + buffer_bytes += b->length; + num = lru.size(); + } + void _rm(BlueStore::Buffer *b) override { + ceph_assert(buffer_bytes >= b->length); + buffer_bytes -= b->length; + auto q = lru.iterator_to(*b); + lru.erase(q); + num = lru.size(); + } + void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override { + src->_rm(b); + _add(b, 0, nullptr); + } + void _adjust_size(BlueStore::Buffer *b, int64_t delta) override { + ceph_assert((int64_t)buffer_bytes + delta >= 0); + buffer_bytes += delta; + } + void _touch(BlueStore::Buffer *b) override { + auto p = lru.iterator_to(*b); + lru.erase(p); + lru.push_front(*b); + num = lru.size(); + _audit("_touch_buffer end"); + } -// LRUCache -#undef dout_prefix -#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") " - -void BlueStore::LRUCache::_touch_onode(OnodeRef& o) -{ - auto p = onode_lru.iterator_to(*o); - onode_lru.erase(p); - onode_lru.push_front(*o); -} - -void BlueStore::LRUCache::_trim_onodes_to(uint64_t max) { - if (max >= onode_lru.size()) { - return; // don't even try - } - uint64_t num = onode_lru.size() - max; - - auto p = onode_lru.end(); - ceph_assert(p != 
onode_lru.begin()); - --p; - int skipped = 0; - int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned; - while (num > 0) { - Onode *o = &*p; - int refs = o->nref.load(); - if (refs > 1) { - dout(20) << __func__ << " " << o->oid << " has " << refs - << " refs, skipping" << dendl; - if (++skipped >= max_skipped) { - dout(20) << __func__ << " maximum skip pinned reached; stopping with " - << num << " left to trim" << dendl; + void _trim_to(uint64_t max) override + { + while (buffer_bytes > max) { + auto i = lru.rbegin(); + if (i == lru.rend()) { + // stop if lru is now empty break; } - if (p == onode_lru.begin()) { - break; - } else { - p--; - num--; - continue; - } - } - dout(30) << __func__ << " rm " << o->oid << dendl; - if (p != onode_lru.begin()) { - onode_lru.erase(p--); - } else { - onode_lru.erase(p); - ceph_assert(num == 1); + BlueStore::Buffer *b = &*i; + ceph_assert(b->is_clean()); + dout(20) << __func__ << " rm " << *b << dendl; + b->space->_rm_buffer(this, b); } - o->get(); // paranoia - o->c->onode_map.remove(o->oid); - o->put(); - --num; + num = lru.size(); } -} - -void BlueStore::LRUCache::_trim_buffers_to(uint64_t max) { - while (buffer_size > max) { - auto i = buffer_lru.rbegin(); - if (i == buffer_lru.rend()) { - // stop if buffer_lru is now empty - break; - } - Buffer *b = &*i; - ceph_assert(b->is_clean()); - dout(20) << __func__ << " rm " << *b << dendl; - b->space->_rm_buffer(this, b); + void add_stats(uint64_t *extents, + uint64_t *blobs, + uint64_t *buffers, + uint64_t *bytes) override { + *extents += num_extents; + *blobs += num_blobs; + *buffers += num; + *bytes += buffer_bytes; } -} - #ifdef DEBUG_CACHE -void BlueStore::LRUCache::_audit(const char *when) -{ - dout(10) << __func__ << " " << when << " start" << dendl; - uint64_t s = 0; - for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) { - s += i->length; - } - if (s != buffer_size) { - derr << __func__ << " buffer_size " << buffer_size << " actual " << s - << dendl; - 
for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) { - derr << __func__ << " " << *i << dendl; + void _audit(const char *s) override + { + dout(10) << __func__ << " " << when << " start" << dendl; + uint64_t s = 0; + for (auto i = lru.begin(); i != lru.end(); ++i) { + s += i->length; } - ceph_assert(s == buffer_size); + if (s != buffer_bytes) { + derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s + << dendl; + for (auto i = lru.begin(); i != lru.end(); ++i) { + derr << __func__ << " " << *i << dendl; + } + ceph_assert(s == buffer_bytes); + } + dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes + << " ok" << dendl; } - dout(20) << __func__ << " " << when << " buffer_size " << buffer_size - << " ok" << dendl; -} #endif +}; -// TwoQCache -#undef dout_prefix -#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") " +// TwoQBufferCacheShard + +struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard { + typedef boost::intrusive::list< + BlueStore::Buffer, + boost::intrusive::member_hook< + BlueStore::Buffer, + boost::intrusive::list_member_hook<>, + &BlueStore::Buffer::lru_item> > list_t; + list_t hot; ///< "Am" hot buffers + list_t warm_in; ///< "A1in" newly warm buffers + list_t warm_out; ///< "A1out" empty buffers we've evicted + uint64_t buffer_bytes = 0; ///< bytes + + enum { + BUFFER_NEW = 0, + BUFFER_WARM_IN, ///< in warm_in + BUFFER_WARM_OUT, ///< in warm_out + BUFFER_HOT, ///< in hot + BUFFER_TYPE_MAX + }; + uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type -void BlueStore::TwoQCache::_touch_onode(OnodeRef& o) -{ - auto p = onode_lru.iterator_to(*o); - onode_lru.erase(p); - onode_lru.push_front(*o); -} +public: + explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {} -void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near) -{ - dout(20) << __func__ << " level " << level << " near " << near - << " on " << *b - << " which has cache_private " 
<< b->cache_private << dendl; - if (near) { - b->cache_private = near->cache_private; + void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override + { + dout(20) << __func__ << " level " << level << " near " << near + << " on " << *b + << " which has cache_private " << b->cache_private << dendl; + if (near) { + b->cache_private = near->cache_private; + switch (b->cache_private) { + case BUFFER_WARM_IN: + warm_in.insert(warm_in.iterator_to(*near), *b); + break; + case BUFFER_WARM_OUT: + ceph_assert(b->is_empty()); + warm_out.insert(warm_out.iterator_to(*near), *b); + break; + case BUFFER_HOT: + hot.insert(hot.iterator_to(*near), *b); + break; + default: + ceph_abort_msg("bad cache_private"); + } + } else if (b->cache_private == BUFFER_NEW) { + b->cache_private = BUFFER_WARM_IN; + if (level > 0) { + warm_in.push_front(*b); + } else { + // take caller hint to start at the back of the warm queue + warm_in.push_back(*b); + } + } else { + // we got a hint from discard + switch (b->cache_private) { + case BUFFER_WARM_IN: + // stay in warm_in. move to front, even though 2Q doesn't actually + // do this. + dout(20) << __func__ << " move to front of warm " << *b << dendl; + warm_in.push_front(*b); + break; + case BUFFER_WARM_OUT: + b->cache_private = BUFFER_HOT; + // move to hot. 
fall-thru + case BUFFER_HOT: + dout(20) << __func__ << " move to front of hot " << *b << dendl; + hot.push_front(*b); + break; + default: + ceph_abort_msg("bad cache_private"); + } + } + if (!b->is_empty()) { + buffer_bytes += b->length; + list_bytes[b->cache_private] += b->length; + } + num = hot.size() + warm_in.size(); + } + + void _rm(BlueStore::Buffer *b) override + { + dout(20) << __func__ << " " << *b << dendl; + if (!b->is_empty()) { + ceph_assert(buffer_bytes >= b->length); + buffer_bytes -= b->length; + ceph_assert(list_bytes[b->cache_private] >= b->length); + list_bytes[b->cache_private] -= b->length; + } switch (b->cache_private) { case BUFFER_WARM_IN: - buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b); + warm_in.erase(warm_in.iterator_to(*b)); break; case BUFFER_WARM_OUT: - ceph_assert(b->is_empty()); - buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b); + warm_out.erase(warm_out.iterator_to(*b)); break; case BUFFER_HOT: - buffer_hot.insert(buffer_hot.iterator_to(*near), *b); + hot.erase(hot.iterator_to(*b)); break; default: ceph_abort_msg("bad cache_private"); } - } else if (b->cache_private == BUFFER_NEW) { - b->cache_private = BUFFER_WARM_IN; - if (level > 0) { - buffer_warm_in.push_front(*b); - } else { - // take caller hint to start at the back of the warm queue - buffer_warm_in.push_back(*b); - } - } else { - // we got a hint from discard + num = hot.size() + warm_in.size(); + } + + void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override + { + TwoQBufferCacheShard *src = static_cast(srcc); + src->_rm(b); + + // preserve which list we're on (even if we can't preserve the order!) switch (b->cache_private) { case BUFFER_WARM_IN: - // stay in warm_in. move to front, even though 2Q doesn't actually - // do this. 
- dout(20) << __func__ << " move to front of warm " << *b << dendl; - buffer_warm_in.push_front(*b); + ceph_assert(!b->is_empty()); + warm_in.push_back(*b); break; case BUFFER_WARM_OUT: - b->cache_private = BUFFER_HOT; - // move to hot. fall-thru + ceph_assert(b->is_empty()); + warm_out.push_back(*b); + break; case BUFFER_HOT: - dout(20) << __func__ << " move to front of hot " << *b << dendl; - buffer_hot.push_front(*b); + ceph_assert(!b->is_empty()); + hot.push_back(*b); break; default: ceph_abort_msg("bad cache_private"); } + if (!b->is_empty()) { + buffer_bytes += b->length; + list_bytes[b->cache_private] += b->length; + } + num = hot.size() + warm_in.size(); } - if (!b->is_empty()) { - buffer_bytes += b->length; - buffer_list_bytes[b->cache_private] += b->length; - } -} -void BlueStore::TwoQCache::_rm_buffer(Buffer *b) -{ - dout(20) << __func__ << " " << *b << dendl; - if (!b->is_empty()) { - ceph_assert(buffer_bytes >= b->length); - buffer_bytes -= b->length; - ceph_assert(buffer_list_bytes[b->cache_private] >= b->length); - buffer_list_bytes[b->cache_private] -= b->length; - } - switch (b->cache_private) { - case BUFFER_WARM_IN: - buffer_warm_in.erase(buffer_warm_in.iterator_to(*b)); - break; - case BUFFER_WARM_OUT: - buffer_warm_out.erase(buffer_warm_out.iterator_to(*b)); - break; - case BUFFER_HOT: - buffer_hot.erase(buffer_hot.iterator_to(*b)); - break; - default: - ceph_abort_msg("bad cache_private"); + void _adjust_size(BlueStore::Buffer *b, int64_t delta) override + { + dout(20) << __func__ << " delta " << delta << " on " << *b << dendl; + if (!b->is_empty()) { + ceph_assert((int64_t)buffer_bytes + delta >= 0); + buffer_bytes += delta; + ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0); + list_bytes[b->cache_private] += delta; + } } -} -void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b) -{ - TwoQCache *src = static_cast(srcc); - src->_rm_buffer(b); - - // preserve which list we're on (even if we can't preserve the order!) 
- switch (b->cache_private) { - case BUFFER_WARM_IN: - ceph_assert(!b->is_empty()); - buffer_warm_in.push_back(*b); - break; - case BUFFER_WARM_OUT: - ceph_assert(b->is_empty()); - buffer_warm_out.push_back(*b); - break; - case BUFFER_HOT: - ceph_assert(!b->is_empty()); - buffer_hot.push_back(*b); - break; - default: - ceph_abort_msg("bad cache_private"); - } - if (!b->is_empty()) { - buffer_bytes += b->length; - buffer_list_bytes[b->cache_private] += b->length; + void _touch(BlueStore::Buffer *b) override { + switch (b->cache_private) { + case BUFFER_WARM_IN: + // do nothing (somewhat counter-intuitively!) + break; + case BUFFER_WARM_OUT: + // move from warm_out to hot LRU + ceph_abort_msg("this happens via discard hint"); + break; + case BUFFER_HOT: + // move to front of hot LRU + hot.erase(hot.iterator_to(*b)); + hot.push_front(*b); + break; + } + num = hot.size() + warm_in.size(); + _audit("_touch_buffer end"); } -} -void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta) -{ - dout(20) << __func__ << " delta " << delta << " on " << *b << dendl; - if (!b->is_empty()) { - ceph_assert((int64_t)buffer_bytes + delta >= 0); - buffer_bytes += delta; - ceph_assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0); - buffer_list_bytes[b->cache_private] += delta; - } -} - -void BlueStore::TwoQCache::_trim_onodes_to(uint64_t max) { - if (max >= onode_lru.size()) { - return; // don't even try - } - uint64_t num = onode_lru.size() - max; - - auto p = onode_lru.end(); - ceph_assert(p != onode_lru.begin()); - --p; - int skipped = 0; - int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned; - while (num > 0) { - Onode *o = &*p; - dout(20) << __func__ << " considering " << o << dendl; - int refs = o->nref.load(); - if (refs > 1) { - dout(20) << __func__ << " " << o->oid << " has " << refs - << " refs; skipping" << dendl; - if (++skipped >= max_skipped) { - dout(20) << __func__ << " maximum skip pinned reached; stopping with " - << num << " 
left to trim" << dendl; - break; - } + void _trim_to(uint64_t max) override + { + if (buffer_bytes > max) { + uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio; + uint64_t khot = max - kin; + + // pre-calculate kout based on average buffer size too, + // which is typical(the warm_in and hot lists may change later) + uint64_t kout = 0; + uint64_t buffer_num = hot.size() + warm_in.size(); + if (buffer_num) { + uint64_t avg_size = buffer_bytes / buffer_num; + ceph_assert(avg_size); + uint64_t calculated_num = max / avg_size; + kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio; + } + + if (list_bytes[BUFFER_HOT] < khot) { + // hot is small, give slack to warm_in + kin += khot - list_bytes[BUFFER_HOT]; + } else if (list_bytes[BUFFER_WARM_IN] < kin) { + // warm_in is small, give slack to hot + khot += kin - list_bytes[BUFFER_WARM_IN]; + } + + // adjust warm_in list + int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin; + uint64_t evicted = 0; + + while (to_evict_bytes > 0) { + auto p = warm_in.rbegin(); + if (p == warm_in.rend()) { + // stop if warm_in list is now empty + break; + } - if (p == onode_lru.begin()) { - break; - } else { - p--; - num--; - continue; + BlueStore::Buffer *b = &*p; + ceph_assert(b->is_clean()); + dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl; + ceph_assert(buffer_bytes >= b->length); + buffer_bytes -= b->length; + ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length); + list_bytes[BUFFER_WARM_IN] -= b->length; + to_evict_bytes -= b->length; + evicted += b->length; + b->state = BlueStore::Buffer::STATE_EMPTY; + b->data.clear(); + warm_in.erase(warm_in.iterator_to(*b)); + warm_out.push_front(*b); + b->cache_private = BUFFER_WARM_OUT; + } + + if (evicted > 0) { + dout(20) << __func__ << " evicted " << byte_u_t(evicted) + << " from warm_in list, done evicting warm_in buffers" + << dendl; } - } - dout(30) << __func__ << " " << o->oid << " num=" << num <<" lru size="<get(); // paranoia - 
o->c->onode_map.remove(o->oid); - o->put(); - --num; - } -} - -void BlueStore::TwoQCache::_trim_buffers_to(uint64_t max) { - if (buffer_bytes > max) { - uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio; - uint64_t khot = max - kin; - // pre-calculate kout based on average buffer size too, - // which is typical(the warm_in and hot lists may change later) - uint64_t kout = 0; - uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size(); - if (buffer_num) { - uint64_t buffer_avg_size = buffer_bytes / buffer_num; - ceph_assert(buffer_avg_size); - uint64_t calculated_buffer_num = max / buffer_avg_size; - kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio; - } + // adjust hot list + to_evict_bytes = list_bytes[BUFFER_HOT] - khot; + evicted = 0; - if (buffer_list_bytes[BUFFER_HOT] < khot) { - // hot is small, give slack to warm_in - kin += khot - buffer_list_bytes[BUFFER_HOT]; - } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) { - // warm_in is small, give slack to hot - khot += kin - buffer_list_bytes[BUFFER_WARM_IN]; - } + while (to_evict_bytes > 0) { + auto p = hot.rbegin(); + if (p == hot.rend()) { + // stop if hot list is now empty + break; + } - // adjust warm_in list - int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin; - uint64_t evicted = 0; + BlueStore::Buffer *b = &*p; + dout(20) << __func__ << " buffer_hot rm " << *b << dendl; + ceph_assert(b->is_clean()); + // adjust evict size before buffer goes invalid + to_evict_bytes -= b->length; + evicted += b->length; + b->space->_rm_buffer(this, b); + } - while (to_evict_bytes > 0) { - auto p = buffer_warm_in.rbegin(); - if (p == buffer_warm_in.rend()) { - // stop if warm_in list is now empty - break; + if (evicted > 0) { + dout(20) << __func__ << " evicted " << byte_u_t(evicted) + << " from hot list, done evicting hot buffers" + << dendl; } - Buffer *b = &*p; - ceph_assert(b->is_clean()); - dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl; - 
ceph_assert(buffer_bytes >= b->length); - buffer_bytes -= b->length; - ceph_assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length); - buffer_list_bytes[BUFFER_WARM_IN] -= b->length; - to_evict_bytes -= b->length; - evicted += b->length; - b->state = Buffer::STATE_EMPTY; - b->data.clear(); - buffer_warm_in.erase(buffer_warm_in.iterator_to(*b)); - buffer_warm_out.push_front(*b); - b->cache_private = BUFFER_WARM_OUT; - } - - if (evicted > 0) { - dout(20) << __func__ << " evicted " << byte_u_t(evicted) - << " from warm_in list, done evicting warm_in buffers" - << dendl; + // adjust warm out list too, if necessary + int64_t n = warm_out.size() - kout; + while (n-- > 0) { + BlueStore::Buffer *b = &*warm_out.rbegin(); + ceph_assert(b->is_empty()); + dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl; + b->space->_rm_buffer(this, b); + } } + num = hot.size() + warm_in.size(); + } - // adjust hot list - to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot; - evicted = 0; - - while (to_evict_bytes > 0) { - auto p = buffer_hot.rbegin(); - if (p == buffer_hot.rend()) { - // stop if hot list is now empty - break; - } + void add_stats(uint64_t *extents, + uint64_t *blobs, + uint64_t *buffers, + uint64_t *bytes) override { + *extents += num_extents; + *blobs += num_blobs; + *buffers += num; + *bytes += buffer_bytes; + } - Buffer *b = &*p; - dout(20) << __func__ << " buffer_hot rm " << *b << dendl; - ceph_assert(b->is_clean()); - // adjust evict size before buffer goes invalid - to_evict_bytes -= b->length; - evicted += b->length; - b->space->_rm_buffer(this, b); +#ifdef DEBUG_CACHE + void _audit(const char *s) override + { + dout(10) << __func__ << " " << when << " start" << dendl; + uint64_t s = 0; + for (auto i = hot.begin(); i != hot.end(); ++i) { + s += i->length; } - if (evicted > 0) { - dout(20) << __func__ << " evicted " << byte_u_t(evicted) - << " from hot list, done evicting hot buffers" - << dendl; + uint64_t hot_bytes = s; + if (hot_bytes != 
list_bytes[BUFFER_HOT]) { + derr << __func__ << " hot_list_bytes " + << list_bytes[BUFFER_HOT] + << " != actual " << hot_bytes + << dendl; + ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]); } - // adjust warm out list too, if necessary - int64_t num = buffer_warm_out.size() - kout; - while (num-- > 0) { - Buffer *b = &*buffer_warm_out.rbegin(); - ceph_assert(b->is_empty()); - dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl; - b->space->_rm_buffer(this, b); + for (auto i = warm_in.begin(); i != warm_in.end(); ++i) { + s += i->length; } - } -} - -#ifdef DEBUG_CACHE -void BlueStore::TwoQCache::_audit(const char *when) -{ - dout(10) << __func__ << " " << when << " start" << dendl; - uint64_t s = 0; - for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) { - s += i->length; - } - uint64_t hot_bytes = s; - if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) { - derr << __func__ << " hot_list_bytes " - << buffer_list_bytes[BUFFER_HOT] - << " != actual " << hot_bytes - << dendl; - ceph_assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]); - } + uint64_t warm_in_bytes = s - hot_bytes; + if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) { + derr << __func__ << " warm_in_list_bytes " + << list_bytes[BUFFER_WARM_IN] + << " != actual " << warm_in_bytes + << dendl; + ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]); + } - for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) { - s += i->length; - } + if (s != buffer_bytes) { + derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s + << dendl; + ceph_assert(s == buffer_bytes); + } - uint64_t warm_in_bytes = s - hot_bytes; - if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) { - derr << __func__ << " warm_in_list_bytes " - << buffer_list_bytes[BUFFER_WARM_IN] - << " != actual " << warm_in_bytes - << dendl; - ceph_assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]); + dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes + << " ok" << dendl; } 
+#endif +}; - if (s != buffer_bytes) { - derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s - << dendl; - ceph_assert(s == buffer_bytes); - } +// BufferCacheShard - dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes - << " ok" << dendl; +BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create( + CephContext* cct, + string type, + PerfCounters *logger) +{ + BufferCacheShard *c = nullptr; + if (type == "lru") + c = new LruBufferCacheShard(cct); + else if (type == "2q") + c = new TwoQBufferCacheShard(cct); + else + ceph_abort_msg("unrecognized cache type"); + c->logger = logger; + return c; } -#endif - // BufferSpace #undef dout_prefix #define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") " -void BlueStore::BufferSpace::_clear(Cache* cache) +void BlueStore::BufferSpace::_clear(BufferCacheShard* cache) { // note: we already hold cache->lock ldout(cache->cct, 20) << __func__ << dendl; @@ -1336,7 +1416,7 @@ void BlueStore::BufferSpace::_clear(Cache* cache) } } -int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length) +int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length) { // note: we already hold cache->lock ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length @@ -1369,7 +1449,7 @@ int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t len 0, b); } if (!b->is_writing()) { - cache->_adjust_buffer_size(b, front - (int64_t)b->length); + cache->_adjust_size(b, front - (int64_t)b->length); } b->truncate(front); b->maybe_rebuild(); @@ -1378,7 +1458,7 @@ int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t len } else { // drop tail if (!b->is_writing()) { - cache->_adjust_buffer_size(b, front - (int64_t)b->length); + cache->_adjust_size(b, front - (int64_t)b->length); } b->truncate(front); b->maybe_rebuild(); @@ -1406,12 +1486,11 @@ int 
BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t len cache->_audit("discard end 2"); break; } - cache->_trim_buffers(); return cache_private; } void BlueStore::BufferSpace::read( - Cache* cache, + BufferCacheShard* cache, uint32_t offset, uint32_t length, BlueStore::ready_regions_t& res, @@ -1445,7 +1524,7 @@ void BlueStore::BufferSpace::read( offset += l; length -= l; if (!b->is_writing()) { - cache->_touch_buffer(b); + cache->_touch(b); } continue; } @@ -1458,7 +1537,7 @@ void BlueStore::BufferSpace::read( length -= gap; } if (!b->is_writing()) { - cache->_touch_buffer(b); + cache->_touch(b); } if (b->length > length) { res[offset].substr_of(b->data, 0, length); @@ -1483,7 +1562,7 @@ void BlueStore::BufferSpace::read( cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes); } -void BlueStore::BufferSpace::_finish_write(Cache* cache, uint64_t seq) +void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq) { auto i = writing.begin(); while (i != writing.end()) { @@ -1507,15 +1586,15 @@ void BlueStore::BufferSpace::_finish_write(Cache* cache, uint64_t seq) writing.erase(i++); b->maybe_rebuild(); b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data); - cache->_add_buffer(b, 1, nullptr); + cache->_add(b, 1, nullptr); ldout(cache->cct, 20) << __func__ << " added " << *b << dendl; } } - cache->_trim_buffers(); + cache->_trim(); cache->_audit("finish_write end"); } -void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r) +void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r) { std::lock_guard lk(cache->lock); if (buffer_map.empty()) @@ -1539,7 +1618,7 @@ void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSp r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right), 0, p->second.get()); } - cache->_adjust_buffer_size(p->second.get(), -right); + cache->_adjust_size(p->second.get(), 
-right); p->second->truncate(left); break; } @@ -1563,7 +1642,7 @@ void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSp } } ceph_assert(writing.empty()); - cache->_trim_buffers(); + cache->_trim(); } // OnodeSpace @@ -1583,8 +1662,8 @@ BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o } ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl; onode_map[oid] = o; - cache->_add_onode(o, 1); - cache->_trim_onodes(); + cache->_add(o, 1); + cache->_trim(); return o; } @@ -1602,7 +1681,7 @@ BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid) } else { ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second << dendl; - cache->_touch_onode(p->second); + cache->_touch(p->second); hit = true; o = p->second; } @@ -1621,7 +1700,7 @@ void BlueStore::OnodeSpace::clear() std::lock_guard l(cache->lock); ldout(cache->cct, 10) << __func__ << dendl; for (auto &p : onode_map) { - cache->_rm_onode(p.second); + cache->_rm(p.second); } onode_map.clear(); } @@ -1650,7 +1729,7 @@ void BlueStore::OnodeSpace::rename( if (pn != onode_map.end()) { ldout(cache->cct, 30) << __func__ << " removing target " << pn->second << dendl; - cache->_rm_onode(pn->second); + cache->_rm(pn->second); onode_map.erase(pn); } OnodeRef o = po->second; @@ -1658,13 +1737,13 @@ void BlueStore::OnodeSpace::rename( // install a non-existent onode at old location oldo.reset(new Onode(o->c, old_oid, o->key)); po->second = oldo; - cache->_add_onode(po->second, 1); - cache->_trim_onodes(); + cache->_add(po->second, 1); // add at new position and fix oid, key onode_map.insert(make_pair(new_oid, o)); - cache->_touch_onode(o); + cache->_touch(o); o->oid = new_oid; o->key = new_okey; + cache->_trim(); } bool BlueStore::OnodeSpace::map_any(std::function f) @@ -1772,7 +1851,7 @@ void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length, void BlueStore::SharedBlob::finish_write(uint64_t seq) { while (true) { - 
Cache *cache = coll->cache; + BufferCacheShard *cache = coll->cache; std::lock_guard l(cache->lock); if (coll->cache != cache) { ldout(coll->store->cct, 20) << __func__ @@ -2541,9 +2620,9 @@ void BlueStore::ExtentMap::reshard( bool was_too_many_blobs_check = false; auto too_many_blobs_threshold = g_conf()->bluestore_debug_too_many_blobs_threshold; - auto& dumped_onodes = onode->c->cache->dumped_onodes; - decltype(onode->c->cache->dumped_onodes)::value_type* oid_slot = nullptr; - decltype(onode->c->cache->dumped_onodes)::value_type* oldest_slot = nullptr; + auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes; + decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr; + decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr; for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) { if (e->logical_offset >= needs_reshard_end) { @@ -3388,13 +3467,13 @@ void BlueStore::DeferredBatch::_audit(CephContext *cct) #undef dout_prefix #define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") " -BlueStore::Collection::Collection(BlueStore *store_, Cache *c, coll_t cid) +BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid) : CollectionImpl(cid), store(store_), - cache(c), + cache(bc), lock("BlueStore::Collection::lock", true, false), exists(true), - onode_map(c), + onode_map(oc), commit_queue(nullptr) { } @@ -3588,13 +3667,13 @@ void BlueStore::Collection::split_cache( ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid << dendl; - cache->_rm_onode(p->second); + onode_map.cache->_rm(p->second); p = onode_map.onode_map.erase(p); o->c = dest; - dest->cache->_add_onode(o, 1); + dest->onode_map.cache->_add(o, 1); dest->onode_map.onode_map[o->oid] = o; - dest->onode_map.cache = dest->cache; + dest->onode_map.cache = dest->onode_map.cache; // move over shared blobs and buffers. 
cover shared blobs from // both extent map and spanning blob map (the full extent map @@ -3625,14 +3704,14 @@ void BlueStore::Collection::split_cache( if (!i.second->is_writing()) { ldout(store->cct, 20) << __func__ << " moving " << *i.second << dendl; - dest->cache->_move_buffer(cache, i.second.get()); + dest->cache->_move(cache, i.second.get()); } } } } } } - dest->cache->_trim_onodes(); + dest->cache->_trim(); } // ======================================================= @@ -3724,8 +3803,8 @@ void BlueStore::MempoolThread::_adjust_cache_settings() void BlueStore::MempoolThread::_resize_shards(bool interval_stats) { auto cct = store->cct; - size_t num_shards = store->cache_shards.size(); - + size_t onode_shards = store->onode_cache_shards.size(); + size_t buffer_shards = store->buffer_cache_shards.size(); int64_t kv_used = store->db->get_cache_usage(); int64_t meta_used = meta_cache->_get_used_bytes(); int64_t data_used = data_cache->_get_used_bytes(); @@ -3764,15 +3843,17 @@ void BlueStore::MempoolThread::_resize_shards(bool interval_stats) } uint64_t max_shard_onodes = static_cast( - (meta_alloc / (double) num_shards) / meta_cache->get_bytes_per_onode()); - uint64_t max_shard_buffer = static_cast(data_alloc / num_shards); + (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode()); + uint64_t max_shard_buffer = static_cast(data_alloc / buffer_shards); ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes << " max_shard_buffer: " << max_shard_buffer << dendl; - for (auto i : store->cache_shards) { - i->set_onode_max(max_shard_onodes); - i->set_buffer_max(max_shard_buffer); + for (auto i : store->onode_cache_shards) { + i->set_max(max_shard_onodes); + } + for (auto i : store->buffer_cache_shards) { + i->set_max(max_shard_buffer); } } @@ -3998,10 +4079,14 @@ BlueStore::~BlueStore() ceph_assert(bluefs == NULL); ceph_assert(fsid_fd < 0); ceph_assert(path_fd < 0); - for (auto i : cache_shards) { + for (auto i : onode_cache_shards) { 
delete i; } - cache_shards.clear(); + for (auto i : buffer_cache_shards) { + delete i; + } + onode_cache_shards.clear(); + buffer_cache_shards.clear(); } const char **BlueStore::get_tracked_conf_keys() const @@ -5783,7 +5868,8 @@ int BlueStore::_open_collections(int *errors) CollectionRef c( new Collection( this, - cache_shards[cid.hash_to_shard(cache_shards.size())], + onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())], + buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())], cid)); bufferlist bl = it->value(); auto p = bl.cbegin(); @@ -6538,12 +6624,20 @@ int BlueStore::expand_devices(ostream& out) void BlueStore::set_cache_shards(unsigned num) { dout(10) << __func__ << " " << num << dendl; - size_t old = cache_shards.size(); - ceph_assert(num >= old); - cache_shards.resize(num); - for (unsigned i = old; i < num; ++i) { - cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type, - logger); + size_t oold = onode_cache_shards.size(); + size_t bold = buffer_cache_shards.size(); + ceph_assert(num >= oold && num >= bold); + onode_cache_shards.resize(num); + buffer_cache_shards.resize(num); + for (unsigned i = oold; i < num; ++i) { + onode_cache_shards[i] = + OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type, + logger); + } + for (unsigned i = bold; i < num; ++i) { + buffer_cache_shards[i] = + BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type, + logger); } } @@ -8292,9 +8386,12 @@ void BlueStore::_update_cache_logger() uint64_t num_blobs = 0; uint64_t num_buffers = 0; uint64_t num_buffer_bytes = 0; - for (auto c : cache_shards) { - c->add_stats(&num_onodes, &num_extents, &num_blobs, - &num_buffers, &num_buffer_bytes); + for (auto c : onode_cache_shards) { + c->add_stats(&num_onodes); + } + for (auto c : buffer_cache_shards) { + c->add_stats(&num_extents, &num_blobs, + &num_buffers, &num_buffer_bytes); } logger->set(l_bluestore_onodes, num_onodes); logger->set(l_bluestore_extents, num_extents); @@ -8317,7 
+8414,8 @@ ObjectStore::CollectionHandle BlueStore::create_new_collection( RWLock::WLocker l(coll_lock); Collection *c = new Collection( this, - cache_shards[cid.hash_to_shard(cache_shards.size())], + onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())], + buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())], cid); new_coll_map[cid] = c; _osr_attach(c); @@ -13705,7 +13803,11 @@ void BlueStore::generate_db_histogram(Formatter *f) void BlueStore::_flush_cache() { dout(10) << __func__ << dendl; - for (auto i : cache_shards) { + for (auto i : onode_cache_shards) { + i->flush(); + ceph_assert(i->empty()); + } + for (auto i : buffer_cache_shards) { i->flush(); ceph_assert(i->empty()); } @@ -13731,7 +13833,10 @@ void BlueStore::_flush_cache() int BlueStore::flush_cache(ostream *os) { dout(10) << __func__ << dendl; - for (auto i : cache_shards) { + for (auto i : onode_cache_shards) { + i->flush(); + } + for (auto i : buffer_cache_shards) { i->flush(); } diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index ac788116f5531..10d35ca7ca945 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -251,7 +251,7 @@ public: } }; - struct Cache; + struct BufferCacheShard; /// map logical extent range (object) onto buffers struct BufferSpace { @@ -279,7 +279,7 @@ public: ceph_assert(writing.empty()); } - void _add_buffer(Cache* cache, Buffer *b, int level, Buffer *near) { + void _add_buffer(BufferCacheShard* cache, Buffer *b, int level, Buffer *near) { cache->_audit("_add_buffer start"); buffer_map[b->offset].reset(b); if (b->is_writing()) { @@ -299,21 +299,21 @@ public: } } else { b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data); - cache->_add_buffer(b, level, near); + cache->_add(b, level, near); } cache->_audit("_add_buffer end"); } - void _rm_buffer(Cache* cache, Buffer *b) { + void _rm_buffer(BufferCacheShard* cache, Buffer *b) { _rm_buffer(cache, buffer_map.find(b->offset)); } - void 
_rm_buffer(Cache* cache, + void _rm_buffer(BufferCacheShard* cache, map>::iterator p) { ceph_assert(p != buffer_map.end()); cache->_audit("_rm_buffer start"); if (p->second->is_writing()) { writing.erase(writing.iterator_to(*p->second)); } else { - cache->_rm_buffer(p->second.get()); + cache->_rm(p->second.get()); } buffer_map.erase(p); cache->_audit("_rm_buffer end"); @@ -331,45 +331,47 @@ public: } // must be called under protection of the Cache lock - void _clear(Cache* cache); + void _clear(BufferCacheShard* cache); // return value is the highest cache_private of a trimmed buffer, or 0. - int discard(Cache* cache, uint32_t offset, uint32_t length) { + int discard(BufferCacheShard* cache, uint32_t offset, uint32_t length) { std::lock_guard l(cache->lock); - return _discard(cache, offset, length); + int ret = _discard(cache, offset, length); + cache->_trim(); + return ret; } - int _discard(Cache* cache, uint32_t offset, uint32_t length); + int _discard(BufferCacheShard* cache, uint32_t offset, uint32_t length); - void write(Cache* cache, uint64_t seq, uint32_t offset, bufferlist& bl, + void write(BufferCacheShard* cache, uint64_t seq, uint32_t offset, bufferlist& bl, unsigned flags) { std::lock_guard l(cache->lock); Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl, flags); b->cache_private = _discard(cache, offset, bl.length()); _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 
0 : 1, nullptr); - cache->_trim_buffers(); + cache->_trim(); } - void _finish_write(Cache* cache, uint64_t seq); - void did_read(Cache* cache, uint32_t offset, bufferlist& bl) { + void _finish_write(BufferCacheShard* cache, uint64_t seq); + void did_read(BufferCacheShard* cache, uint32_t offset, bufferlist& bl) { std::lock_guard l(cache->lock); Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl); b->cache_private = _discard(cache, offset, bl.length()); _add_buffer(cache, b, 1, nullptr); - cache->_trim_buffers(); + cache->_trim(); } - void read(Cache* cache, uint32_t offset, uint32_t length, + void read(BufferCacheShard* cache, uint32_t offset, uint32_t length, BlueStore::ready_regions_t& res, interval_set& res_intervals, int flags = 0); - void truncate(Cache* cache, uint32_t offset) { + void truncate(BufferCacheShard* cache, uint32_t offset) { discard(cache, offset, (uint32_t)-1 - offset); } - void split(Cache* cache, size_t pos, BufferSpace &r); + void split(BufferCacheShard* cache, size_t pos, BufferSpace &r); - void dump(Cache* cache, Formatter *f) const { + void dump(BufferCacheShard* cache, Formatter *f) const { std::lock_guard l(cache->lock); f->open_array_section("buffers"); for (auto& i : buffer_map) { @@ -433,7 +435,7 @@ public: friend bool operator==(const SharedBlob &l, const SharedBlob &r) { return l.get_sbid() == r.get_sbid(); } - inline Cache* get_cache() { + inline BufferCacheShard* get_cache() { return coll ? 
coll->cache : nullptr; } inline SharedBlobSet* get_parent() { @@ -1086,86 +1088,40 @@ public: }; typedef boost::intrusive_ptr OnodeRef; - - /// a cache (shard) of onodes and buffers - struct Cache { - CephContext* cct; + /// A generic Cache Shard + struct CacheShard { + CephContext *cct; PerfCounters *logger; /// protect lru and other structures ceph::recursive_mutex lock = { - ceph::make_recursive_mutex("BlueStore::Cache::lock") }; - - std::atomic num_extents = {0}; - std::atomic num_blobs = {0}; - std::atomic onode_max = {0}; - std::atomic buffer_max = {0}; - - std::array, 64> dumped_onodes; - - static Cache *create(CephContext* cct, string type, PerfCounters *logger); - - Cache(CephContext* cct) : cct(cct), logger(nullptr) {} - virtual ~Cache() {} - - virtual void _add_onode(OnodeRef& o, int level) = 0; - virtual void _rm_onode(OnodeRef& o) = 0; - virtual void _touch_onode(OnodeRef& o) = 0; - - virtual void _add_buffer(Buffer *b, int level, Buffer *near) = 0; - virtual void _rm_buffer(Buffer *b) = 0; - virtual void _move_buffer(Cache *src, Buffer *b) = 0; - virtual void _adjust_buffer_size(Buffer *b, int64_t delta) = 0; - virtual void _touch_buffer(Buffer *b) = 0; + ceph::make_recursive_mutex("BlueStore::CacheShard::lock") }; - virtual uint64_t _get_num_onodes() = 0; - virtual uint64_t _get_buffer_bytes() = 0; + std::atomic max = {0}; + std::atomic num = {0}; - void add_extent() { - ++num_extents; - } - void rm_extent() { - --num_extents; - } - - void add_blob() { - ++num_blobs; - } - void rm_blob() { - --num_blobs; - } + CacheShard(CephContext* cct) : cct(cct), logger(nullptr) {} + virtual ~CacheShard() {} - void set_onode_max(uint64_t max) { - onode_max = max; + void set_max(uint64_t max_) { + max = max_; } - void set_buffer_max(uint64_t max) { - buffer_max = max; + uint64_t _get_num() { + return num; } - void flush(); - void trim_onodes(); - void trim_buffers(); - - virtual void _trim_onodes_to(uint64_t max) = 0; - virtual void _trim_buffers_to(uint64_t max) 
= 0; - - void _trim_onodes() { - _trim_onodes_to(onode_max); + virtual void _trim_to(uint64_t max) = 0; + void _trim() { + _trim_to(max); } - - void _trim_buffers() { - _trim_buffers_to(buffer_max); + void trim() { + std::lock_guard l(lock); + _trim(); } - - virtual void add_stats(uint64_t *onodes, uint64_t *extents, - uint64_t *blobs, - uint64_t *buffers, - uint64_t *bytes) = 0; - - bool empty() { + void flush() { std::lock_guard l(lock); - return _get_num_onodes() == 0 && _get_buffer_bytes() == 0; + _trim_to(0); } #ifdef DEBUG_CACHE @@ -1175,206 +1131,79 @@ public: #endif }; - /// simple LRU cache for onodes and buffers - struct LRUCache : public Cache { - private: - typedef boost::intrusive::list< - Onode, - boost::intrusive::member_hook< - Onode, - boost::intrusive::list_member_hook<>, - &Onode::lru_item> > onode_lru_list_t; - typedef boost::intrusive::list< - Buffer, - boost::intrusive::member_hook< - Buffer, - boost::intrusive::list_member_hook<>, - &Buffer::lru_item> > buffer_lru_list_t; - - onode_lru_list_t onode_lru; - - buffer_lru_list_t buffer_lru; - uint64_t buffer_size = 0; - + /// A Generic onode Cache Shard + struct OnodeCacheShard : public CacheShard { + std::array, 64> dumped_onodes; public: - LRUCache(CephContext* cct) : Cache(cct) {} - uint64_t _get_num_onodes() override { - return onode_lru.size(); - } - void _add_onode(OnodeRef& o, int level) override { - if (level > 0) - onode_lru.push_front(*o); - else - onode_lru.push_back(*o); - } - void _rm_onode(OnodeRef& o) override { - auto q = onode_lru.iterator_to(*o); - onode_lru.erase(q); - } - void _touch_onode(OnodeRef& o) override; - - uint64_t _get_buffer_bytes() override { - return buffer_size; - } - void _add_buffer(Buffer *b, int level, Buffer *near) override { - if (near) { - auto q = buffer_lru.iterator_to(*near); - buffer_lru.insert(q, *b); - } else if (level > 0) { - buffer_lru.push_front(*b); - } else { - buffer_lru.push_back(*b); - } - buffer_size += b->length; - } - void 
_rm_buffer(Buffer *b) override { - ceph_assert(buffer_size >= b->length); - buffer_size -= b->length; - auto q = buffer_lru.iterator_to(*b); - buffer_lru.erase(q); - } - void _move_buffer(Cache *src, Buffer *b) override { - src->_rm_buffer(b); - _add_buffer(b, 0, nullptr); - } - void _adjust_buffer_size(Buffer *b, int64_t delta) override { - ceph_assert((int64_t)buffer_size + delta >= 0); - buffer_size += delta; - } - void _touch_buffer(Buffer *b) override { - auto p = buffer_lru.iterator_to(*b); - buffer_lru.erase(p); - buffer_lru.push_front(*b); - _audit("_touch_buffer end"); - } - - void _trim_onodes_to(uint64_t max) override; - void _trim_buffers_to(uint64_t max) override; + OnodeCacheShard(CephContext* cct) : CacheShard(cct) {} + static OnodeCacheShard *create(CephContext* cct, string type, + PerfCounters *logger); + virtual void _add(OnodeRef& o, int level) = 0; + virtual void _rm(OnodeRef& o) = 0; + virtual void _touch(OnodeRef& o) = 0; + virtual void add_stats(uint64_t *onodes) = 0; - void add_stats(uint64_t *onodes, uint64_t *extents, - uint64_t *blobs, - uint64_t *buffers, - uint64_t *bytes) override { - std::lock_guard l(lock); - *onodes += onode_lru.size(); - *extents += num_extents; - *blobs += num_blobs; - *buffers += buffer_lru.size(); - *bytes += buffer_size; + bool empty() { + return _get_num() == 0; } - -#ifdef DEBUG_CACHE - void _audit(const char *s) override; -#endif }; - // 2Q cache for buffers, LRU for onodes - struct TwoQCache : public Cache { - private: - // stick with LRU for onodes for now (fixme?) 
- typedef boost::intrusive::list< - Onode, - boost::intrusive::member_hook< - Onode, - boost::intrusive::list_member_hook<>, - &Onode::lru_item> > onode_lru_list_t; - typedef boost::intrusive::list< - Buffer, - boost::intrusive::member_hook< - Buffer, - boost::intrusive::list_member_hook<>, - &Buffer::lru_item> > buffer_list_t; - - onode_lru_list_t onode_lru; - - buffer_list_t buffer_hot; ///< "Am" hot buffers - buffer_list_t buffer_warm_in; ///< "A1in" newly warm buffers - buffer_list_t buffer_warm_out; ///< "A1out" empty buffers we've evicted - uint64_t buffer_bytes = 0; ///< bytes - - enum { - BUFFER_NEW = 0, - BUFFER_WARM_IN, ///< in buffer_warm_in - BUFFER_WARM_OUT, ///< in buffer_warm_out - BUFFER_HOT, ///< in buffer_hot - BUFFER_TYPE_MAX - }; - - uint64_t buffer_list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type + /// A Generic buffer Cache Shard + struct BufferCacheShard : public CacheShard { + std::atomic num_extents = {0}; + std::atomic num_blobs = {0}; + uint64_t buffer_bytes = 0; public: - TwoQCache(CephContext* cct) : Cache(cct) {} - uint64_t _get_num_onodes() override { - return onode_lru.size(); + BufferCacheShard(CephContext* cct) : CacheShard(cct) {} + static BufferCacheShard *create(CephContext* cct, string type, + PerfCounters *logger); + virtual void _add(Buffer *b, int level, Buffer *near) = 0; + virtual void _rm(Buffer *b) = 0; + virtual void _move(BufferCacheShard *src, Buffer *b) = 0; + virtual void _touch(Buffer *b) = 0; + virtual void _adjust_size(Buffer *b, int64_t delta) = 0; + + uint64_t _get_bytes() { + return buffer_bytes; } - void _add_onode(OnodeRef& o, int level) override { - if (level > 0) - onode_lru.push_front(*o); - else - onode_lru.push_back(*o); + + void add_extent() { + ++num_extents; } - void _rm_onode(OnodeRef& o) override { - auto q = onode_lru.iterator_to(*o); - onode_lru.erase(q); + void rm_extent() { + --num_extents; } - void _touch_onode(OnodeRef& o) override; - uint64_t _get_buffer_bytes() override { - return 
buffer_bytes; + void add_blob() { + ++num_blobs; } - void _add_buffer(Buffer *b, int level, Buffer *near) override; - void _rm_buffer(Buffer *b) override; - void _move_buffer(Cache *src, Buffer *b) override; - void _adjust_buffer_size(Buffer *b, int64_t delta) override; - void _touch_buffer(Buffer *b) override { - switch (b->cache_private) { - case BUFFER_WARM_IN: - // do nothing (somewhat counter-intuitively!) - break; - case BUFFER_WARM_OUT: - // move from warm_out to hot LRU - ceph_abort_msg("this happens via discard hint"); - break; - case BUFFER_HOT: - // move to front of hot LRU - buffer_hot.erase(buffer_hot.iterator_to(*b)); - buffer_hot.push_front(*b); - break; - } - _audit("_touch_buffer end"); + void rm_blob() { + --num_blobs; } - void _trim_onodes_to(uint64_t max) override; - void _trim_buffers_to(uint64_t max) override; + virtual void add_stats(uint64_t *extents, + uint64_t *blobs, + uint64_t *buffers, + uint64_t *bytes) = 0; - void add_stats(uint64_t *onodes, uint64_t *extents, - uint64_t *blobs, - uint64_t *buffers, - uint64_t *bytes) override { + bool empty() { std::lock_guard l(lock); - *onodes += onode_lru.size(); - *extents += num_extents; - *blobs += num_blobs; - *buffers += buffer_hot.size() + buffer_warm_in.size(); - *bytes += buffer_bytes; + return _get_bytes() == 0; } - -#ifdef DEBUG_CACHE - void _audit(const char *s) override; -#endif }; struct OnodeSpace { - private: - Cache *cache; + OnodeCacheShard *cache; + private: /// forward lookups mempool::bluestore_cache_other::unordered_map onode_map; friend class Collection; // for split_cache() public: - OnodeSpace(Cache *c) : cache(c) {} + OnodeSpace(OnodeCacheShard *c) : cache(c) {} ~OnodeSpace() { clear(); } @@ -1403,7 +1232,7 @@ public: struct Collection : public CollectionImpl { BlueStore *store; OpSequencerRef osr; - Cache *cache; ///< our cache shard + BufferCacheShard *cache; ///< our cache shard bluestore_cnode_t cnode; RWLock lock; @@ -1460,7 +1289,7 @@ public: void flush() override; 
void flush_all_but_last(); - Collection(BlueStore *ns, Cache *ca, coll_t c); + Collection(BlueStore *ns, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t c); }; class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl { @@ -1939,7 +1768,8 @@ private: mempool::bluestore_cache_other::unordered_map coll_map; map new_coll_map; - vector cache_shards; + vector onode_cache_shards; + vector buffer_cache_shards; /// protect zombie_osr_set ceph::mutex zombie_osr_lock = ceph::make_mutex("BlueStore::zombie_osr_lock"); @@ -2149,8 +1979,8 @@ private: virtual uint64_t _get_used_bytes() const { uint64_t bytes = 0; - for (auto i : store->cache_shards) { - bytes += i->_get_buffer_bytes(); + for (auto i : store->buffer_cache_shards) { + bytes += i->_get_bytes(); } return bytes; } @@ -2470,18 +2300,22 @@ public: void set_cache_shards(unsigned num) override; void dump_cache_stats(Formatter *f) override { int onode_count = 0, buffers_bytes = 0; - for (auto i: cache_shards) { - onode_count += i->_get_num_onodes(); - buffers_bytes += i->_get_buffer_bytes(); + for (auto i: onode_cache_shards) { + onode_count += i->_get_num(); + } + for (auto i: buffer_cache_shards) { + buffers_bytes += i->_get_bytes(); } f->dump_int("bluestore_onode", onode_count); f->dump_int("bluestore_buffers", buffers_bytes); } void dump_cache_stats(ostream& ss) override { int onode_count = 0, buffers_bytes = 0; - for (auto i: cache_shards) { - onode_count += i->_get_num_onodes(); - buffers_bytes += i->_get_buffer_bytes(); + for (auto i: onode_cache_shards) { + onode_count += i->_get_num(); + } + for (auto i: buffer_cache_shards) { + buffers_bytes += i->_get_bytes(); } ss << "bluestore_onode: " << onode_count; ss << "bluestore_buffers: " << buffers_bytes; diff --git a/src/test/objectstore/test_bluestore_types.cc b/src/test/objectstore/test_bluestore_types.cc index 4e8c21a8d70e6..04bdc0dd3cd60 100644 --- a/src/test/objectstore/test_bluestore_types.cc +++ b/src/test/objectstore/test_bluestore_types.cc @@ -336,9 
+336,12 @@ TEST(Blob, put_ref) { { BlueStore store(g_ceph_context, "", 4096); - BlueStore::Cache *cache = BlueStore::Cache::create( + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( g_ceph_context, "lru", NULL); - BlueStore::Collection coll(&store, cache, coll_t()); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + + BlueStore::Collection coll(&store, oc, bc, coll_t()); BlueStore::Blob b; b.shared_blob = new BlueStore::SharedBlob(nullptr); b.shared_blob->get(); // hack to avoid dtor from running @@ -366,9 +369,11 @@ TEST(Blob, put_ref) unsigned mas = 4096; BlueStore store(g_ceph_context, "", 8192); - BlueStore::Cache *cache = BlueStore::Cache::create( + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( g_ceph_context, "lru", NULL); - BlueStore::CollectionRef coll(new BlueStore::Collection(&store, cache, coll_t())); + BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t())); { BlueStore::Blob B; @@ -812,9 +817,12 @@ TEST(Blob, put_ref) } { BlueStore store(g_ceph_context, "", 0x4000); - BlueStore::Cache *cache = BlueStore::Cache::create( + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( g_ceph_context, "lru", NULL); - BlueStore::CollectionRef coll(new BlueStore::Collection(&store, cache, coll_t())); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + + BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t())); BlueStore::Blob B; B.shared_blob = new BlueStore::SharedBlob(nullptr); B.shared_blob->get(); // hack to avoid dtor from running @@ -899,9 +907,11 @@ TEST(bluestore_blob_t, prune_tail) TEST(Blob, split) { BlueStore store(g_ceph_context, "", 4096); - BlueStore::Cache *cache = BlueStore::Cache::create( - g_ceph_context, "lru", NULL); - 
BlueStore::CollectionRef coll(new BlueStore::Collection(&store, cache, coll_t())); + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t())); { BlueStore::Blob L, R; L.shared_blob = new BlueStore::SharedBlob(coll.get()); @@ -955,9 +965,11 @@ TEST(Blob, split) TEST(Blob, legacy_decode) { BlueStore store(g_ceph_context, "", 4096); - BlueStore::Cache *cache = BlueStore::Cache::create( + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( g_ceph_context, "lru", NULL); - BlueStore::CollectionRef coll(new BlueStore::Collection(&store, cache, coll_t())); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t())); bufferlist bl, bl2; { BlueStore::Blob B; @@ -1033,8 +1045,12 @@ TEST(Blob, legacy_decode) TEST(ExtentMap, seek_lextent) { BlueStore store(g_ceph_context, "", 4096); - BlueStore::LRUCache cache(g_ceph_context); - BlueStore::CollectionRef coll(new BlueStore::Collection(&store, &cache, coll_t())); + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + + BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t())); BlueStore::Onode onode(coll.get(), ghobject_t(), ""); BlueStore::ExtentMap em(&onode); BlueStore::BlobRef br(new BlueStore::Blob); @@ -1082,8 +1098,11 @@ TEST(ExtentMap, seek_lextent) TEST(ExtentMap, has_any_lextents) { BlueStore store(g_ceph_context, "", 4096); - BlueStore::LRUCache cache(g_ceph_context); - BlueStore::CollectionRef coll(new BlueStore::Collection(&store, &cache, coll_t())); + 
BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t())); BlueStore::Onode onode(coll.get(), ghobject_t(), ""); BlueStore::ExtentMap em(&onode); BlueStore::BlobRef b(new BlueStore::Blob); @@ -1129,8 +1148,12 @@ TEST(ExtentMap, has_any_lextents) TEST(ExtentMap, compress_extent_map) { BlueStore store(g_ceph_context, "", 4096); - BlueStore::LRUCache cache(g_ceph_context); - BlueStore::CollectionRef coll(new BlueStore::Collection(&store, &cache, coll_t())); + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + +BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t())); BlueStore::Onode onode(coll.get(), ghobject_t(), ""); BlueStore::ExtentMap em(&onode); BlueStore::BlobRef b1(new BlueStore::Blob); @@ -1181,9 +1204,13 @@ TEST(ExtentMap, compress_extent_map) TEST(GarbageCollector, BasicTest) { - BlueStore::LRUCache cache(g_ceph_context); + BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create( + g_ceph_context, "lru", NULL); + BlueStore store(g_ceph_context, "", 4096); - BlueStore::CollectionRef coll(new BlueStore::Collection(&store, &cache, coll_t())); + BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t())); BlueStore::Onode onode(coll.get(), ghobject_t(), ""); BlueStore::ExtentMap em(&onode); @@ -1268,7 +1295,7 @@ TEST(GarbageCollector, BasicTest) */ { BlueStore store(g_ceph_context, "", 0x10000); - BlueStore::CollectionRef coll(new BlueStore::Collection(&store, &cache, coll_t())); + BlueStore::CollectionRef coll(new 
BlueStore::Collection(&store, oc, bc, coll_t())); BlueStore::Onode onode(coll.get(), ghobject_t(), ""); BlueStore::ExtentMap em(&onode); @@ -1384,7 +1411,7 @@ TEST(GarbageCollector, BasicTest) */ { BlueStore store(g_ceph_context, "", 0x10000); - BlueStore::CollectionRef coll(new BlueStore::Collection(&store, &cache, coll_t())); + BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t())); BlueStore::Onode onode(coll.get(), ghobject_t(), ""); BlueStore::ExtentMap em(&onode); -- 2.39.5