From: Igor Fedotov
Date: Thu, 18 Aug 2022 23:22:45 +0000 (+0300)
Subject: os/bluestore: fix onode ref counting
X-Git-Tag: v18.1.0~404^2~3
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a3057f4c373c935b22492d5ca444ce9d7933fe3a;p=ceph.git

os/bluestore: fix onode ref counting

This should eliminate the duplicate onode releases that could happen
before. Additionally, onode pinning is now performed during cache
trimming rather than on onode ref count increment.

[Hopefully] fixes: https://tracker.ceph.com/issues/53002

Signed-off-by: Igor Fedotov
---

diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 345840cc6742..80f026747c2f 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -1111,12 +1111,11 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
   void _add(BlueStore::Onode* o, int level) override
   {
-    if (o->put_cache()) {
+    o->set_cached();
+    if (o->pin_nref == 1) {
       (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
       o->cache_age_bin = age_bins.front();
       *(o->cache_age_bin) += 1;
-    } else {
-      ++num_pinned;
     }
     ++num; // we count both pinned and unpinned entries
     dout(20) << __func__ << " " << this << " " << o->oid << " added, num="
@@ -1124,41 +1123,50 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
   }
   void _rm(BlueStore::Onode* o) override
   {
-    if (o->pop_cache()) {
+    o->clear_cached();
+    if (o->lru_item.is_linked()) {
       *(o->cache_age_bin) -= 1;
       lru.erase(lru.iterator_to(*o));
-    } else {
-      ceph_assert(num_pinned);
-      --num_pinned;
     }
     ceph_assert(num);
     --num;
     dout(20) << __func__ << " " << this << " " << " " << o->oid << " removed, num=" << num << dendl;
   }
-  void _pin(BlueStore::Onode* o) override
-  {
-    *(o->cache_age_bin) -= 1;
-    lru.erase(lru.iterator_to(*o));
-    ++num_pinned;
-    dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << " pinned" << dendl;
-  }
-  void _unpin(BlueStore::Onode* o) override
-  {
-    lru.push_front(*o);
-    o->cache_age_bin = age_bins.front();
-    *(o->cache_age_bin) += 1;
-    ceph_assert(num_pinned);
-    --num_pinned;
-    dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << " unpinned" << dendl;
-  }
-  void _unpin_and_rm(BlueStore::Onode* o) override
+
+  void maybe_unpin(BlueStore::Onode* o) override
   {
-    o->pop_cache();
-    ceph_assert(num_pinned);
-    --num_pinned;
-    ceph_assert(num);
-    --num;
+    OnodeCacheShard* ocs = this;
+    ocs->lock.lock();
+    // It is possible that, while we were waiting for the lock, split_cache
+    // moved us to a different OnodeCacheShard.
+    while (ocs != o->c->get_onode_cache()) {
+      ocs->lock.unlock();
+      ocs = o->c->get_onode_cache();
+      ocs->lock.lock();
+    }
+    if (o->is_cached() && o->pin_nref == 1) {
+      if (!o->lru_item.is_linked()) {
+        if (o->exists) {
+          lru.push_front(*o);
+          o->cache_age_bin = age_bins.front();
+          *(o->cache_age_bin) += 1;
+          dout(20) << __func__ << " " << this << " " << o->oid << " unpinned"
+                   << dendl;
+        } else {
+          ceph_assert(num);
+          --num;
+          o->clear_cached();
+          // remove will also decrement nref
+          o->c->onode_space._remove(o->oid);
+        }
+      } else if (o->exists) {
+        // move onode within LRU
+        _rm(o);
+        _add(o, 1);
+      }
+    }
+    ocs->lock.unlock();
   }
+
   void _trim_to(uint64_t new_size) override
   {
     if (new_size >= lru.size()) {
@@ -1169,11 +1177,11 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
     ceph_assert(p != lru.begin());
     --p;
     ceph_assert(num >= n);
-    num -= n;
     while (n-- > 0) {
       BlueStore::Onode *o = &*p;
       dout(20) << __func__ << " rm " << o->oid << " "
-               << o->nref << " " << o->cached << " " << o->pinned << dendl;
+               << o->nref << " " << o->cached << dendl;
+
       if (p != lru.begin()) {
         lru.erase(p--);
       } else {
@@ -1181,29 +1189,30 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
        lru.erase(p);
       }
       *(o->cache_age_bin) -= 1;
-      auto pinned = !o->pop_cache();
-      ceph_assert(!pinned);
-      o->c->onode_space._remove(o->oid);
+      if (o->pin_nref > 1) {
+        dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << dendl;
+      } else {
+        ceph_assert(num);
+        --num;
+        o->clear_cached();
+        o->c->onode_space._remove(o->oid);
+      }
     }
   }
-  void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
+  void _move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
   {
     if (to == this) {
       return;
     }
-    ceph_assert(o->cached);
-    ceph_assert(o->pinned);
-    ceph_assert(num);
-    ceph_assert(num_pinned);
-    --num_pinned;
-    --num;
-    ++to->num_pinned;
-    ++to->num;
+    _rm(o);
+    ceph_assert(o->nref > 1);
+    to->_add(o, 0);
   }
   void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
   {
+    std::lock_guard l(lock);
     *onodes += num;
-    *pinned_onodes += num_pinned;
+    *pinned_onodes += num - lru.size();
   }
 };
@@ -1963,12 +1972,10 @@ BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
     ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
                           << " " << p->second->nref
                           << " " << p->second->cached
-                          << " " << p->second->pinned
                           << dendl;
     // This will pin onode and implicitly touch the cache when Onode
     // eventually will become unpinned
     o = p->second;
-    ceph_assert(!o->cached || o->pinned);
     cache->logger->inc(l_bluestore_onode_hits);
   }
@@ -2024,7 +2031,6 @@ void BlueStore::OnodeSpace::rename(
   // This will pin 'o' and implicitly touch cache
   // when it will eventually become unpinned
   onode_map.insert(make_pair(new_oid, o));
-  ceph_assert(o->pinned);
   o->oid = new_oid;
   o->key = new_okey;
@@ -2050,7 +2056,6 @@ void BlueStore::OnodeSpace::dump(CephContext *cct)
     ldout(cct, LogLevelV) << i.first << " : " << i.second
                           << " " << i.second->nref
                           << " " << i.second->cached
-                          << " " << i.second->pinned
                           << dendl;
   }
 }
@@ -3703,53 +3708,17 @@ void BlueStore::Onode::calc_omap_tail(
   out->push_back('~');
 }
 
-void BlueStore::Onode::get() {
-  if (++nref >= 2 && !pinned) {
-    OnodeCacheShard* ocs = c->get_onode_cache();
-    ocs->lock.lock();
-    // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
-    while (ocs != c->get_onode_cache()) {
-      ocs->lock.unlock();
-      ocs = c->get_onode_cache();
-      ocs->lock.lock();
-    }
-    bool was_pinned = pinned;
-    pinned = nref >= 2;
-    bool r = !was_pinned && pinned;
-    if (cached && r) {
-      ocs->_pin(this);
-    }
-    ocs->lock.unlock();
-  }
+void BlueStore::Onode::get()
+{
+  ++nref;
+  ++pin_nref;
 }
 
-void BlueStore::Onode::put() {
-  ++put_nref;
-  int n = --nref;
-  if (n == 1) {
-    OnodeCacheShard* ocs = c->get_onode_cache();
-    ocs->lock.lock();
-    // It is possible that during waiting split_cache moved us to different OnodeCacheShard.
-    while (ocs != c->get_onode_cache()) {
-      ocs->lock.unlock();
-      ocs = c->get_onode_cache();
-      ocs->lock.lock();
-    }
-    bool need_unpin = pinned;
-    pinned = pinned && nref >= 2;
-    need_unpin = need_unpin && !pinned;
-    if (cached && need_unpin) {
-      if (exists) {
-        ocs->_unpin(this);
-      } else {
-        ocs->_unpin_and_rm(this);
-        // remove will also decrement nref
-        c->onode_space._remove(oid);
-      }
-    }
-    ocs->lock.unlock();
+void BlueStore::Onode::put()
+{
+  if (--pin_nref == 1) {
+    c->get_onode_cache()->maybe_unpin(this);
   }
-  auto pn = --put_nref;
-  if (nref == 0 && pn == 0) {
+  if (--nref == 0) {
     delete this;
   }
 }
@@ -4177,15 +4146,13 @@ void BlueStore::Collection::split_cache(
       ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
                             << dendl;
 
-      // ensuring that nref is always >= 2 and hence onode is pinned and
-      // physically out of cache during the transition
+      // ensuring that nref is always >= 2 and hence onode is pinned
       OnodeRef o_pin = o;
-      ceph_assert(o->pinned);
 
       p = onode_space.onode_map.erase(p);
       dest->onode_space.onode_map[o->oid] = o;
       if (o->cached) {
-        get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
+        get_onode_cache()->_move_pinned(dest->get_onode_cache(), o.get());
       }
       o->c = dest;

diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index b1dd71db9395..5d1da94f36ee 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -1176,12 +1176,13 @@ public:
   };
 
   struct OnodeSpace;
+  struct OnodeCacheShard;
   /// an in-memory object
   struct Onode {
     MEMPOOL_CLASS_HELPERS();
 
-    std::atomic_int nref;  ///< reference count
-    std::atomic_int put_nref = {0};
+    std::atomic_int nref = 0;      ///< reference count
+    std::atomic_int pin_nref = 0;  ///< reference count replica to track pinning
     Collection *c;
     ghobject_t oid;
@@ -1195,8 +1196,6 @@ public:
     bool cached;  ///< Onode is logically in the cache
                   /// (it can be pinned and hence physically out
                   /// of it at the moment though)
-    std::atomic_bool pinned;  ///< Onode is pinned
-                              /// (or should be pinned when cached)
     ExtentMap extent_map;
 
     // track txc's that have not been committed to kv store (and whose
@@ -1210,49 +1209,41 @@ public:
     Onode(Collection *c, const ghobject_t& o,
           const mempool::bluestore_cache_meta::string& k)
-      : nref(0),
-        c(c),
+      : c(c),
         oid(o),
         key(k),
         exists(false),
         cached(false),
-        pinned(false),
         extent_map(this, c->store->cct->_conf->
           bluestore_extent_map_inline_shard_prealloc_size) {
     }
     Onode(Collection* c, const ghobject_t& o, const std::string& k)
-      : nref(0),
-        c(c),
+      : c(c),
         oid(o),
         key(k),
         exists(false),
         cached(false),
-        pinned(false),
         extent_map(this, c->store->cct->_conf->
           bluestore_extent_map_inline_shard_prealloc_size) {
     }
     Onode(Collection* c, const ghobject_t& o, const char* k)
-      : nref(0),
-        c(c),
+      : c(c),
         oid(o),
         key(k),
         exists(false),
         cached(false),
-        pinned(false),
         extent_map(this, c->store->cct->_conf->
           bluestore_extent_map_inline_shard_prealloc_size) {
     }
     Onode(CephContext* cct)
-      : nref(0),
-        c(nullptr),
+      : c(nullptr),
         exists(false),
         cached(false),
-        pinned(false),
         extent_map(this, cct->_conf->
           bluestore_extent_map_inline_shard_prealloc_size) {
@@ -1275,15 +1266,16 @@ public:
     void get();
     void put();
 
-    inline bool put_cache() {
+    inline bool is_cached() const {
+      return cached;
+    }
+    inline void set_cached() {
       ceph_assert(!cached);
       cached = true;
-      return !pinned;
     }
-    inline bool pop_cache() {
+    inline void clear_cached() {
       ceph_assert(cached);
       cached = false;
-      return !pinned;
     }
 
     static const std::string& calc_omap_prefix(uint8_t flags);
@@ -1407,21 +1399,20 @@ private:
 
   /// A Generic onode Cache Shard
   struct OnodeCacheShard : public CacheShard {
-    std::atomic<uint64_t> num_pinned = {0};
 
    std::array<std::pair<ghobject_t, mono_clock::time_point>, 64> dumped_onodes;
-    virtual void _pin(Onode* o) = 0;
-    virtual void _unpin(Onode* o) = 0;
-
  public:
    OnodeCacheShard(CephContext* cct) : CacheShard(cct) {}
    static OnodeCacheShard *create(CephContext* cct, std::string type,
                                   PerfCounters *logger);
+
+    // The following methods, prefixed with '_', are to be called under
+    // the shard's lock
    virtual void _add(Onode* o, int level) = 0;
    virtual void _rm(Onode* o) = 0;
+    virtual void _move_pinned(OnodeCacheShard *to, Onode *o) = 0;
 
-    virtual void _unpin_and_rm(Onode* o) = 0;
-    virtual void move_pinned(OnodeCacheShard *to, Onode *o) = 0;
+    virtual void maybe_unpin(Onode* o) = 0;
    virtual void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) = 0;
    bool empty() {
      return _get_num() == 0;
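
For orientation, below is a minimal, single-threaded model of the pin_nref
scheme this patch introduces. It is an illustration only, not BlueStore code:
Cache and Object are invented stand-ins for OnodeCacheShard/OnodeSpace and
Onode (the map and the cache are merged here), and all locking is omitted.

// Model of the pin_nref scheme: nref is the true refcount, pin_nref a
// replica that also counts the cache's own reference, so pin_nref == 1
// means "only the cache holds this object" (i.e. it is unpinned).
#include <atomic>
#include <iostream>
#include <list>

struct Object;

struct Cache {
  std::list<Object*> lru;  // unpinned, cached objects only
  void add(Object* o);
  void maybe_unpin(Object* o);
  void trim_all();
};

struct Object {
  std::atomic_int nref{0};      // reference count; 0 => delete
  std::atomic_int pin_nref{0};  // replica used to track pinning
  Cache* cache;
  bool cached = false;
  bool in_lru = false;

  explicit Object(Cache* c) : cache(c) {}

  void get() {
    ++nref;
    ++pin_nref;
  }
  void put() {
    // When pin_nref falls back to 1, the cache holds the only reference,
    // so the object becomes a candidate for the LRU again.
    if (--pin_nref == 1) {
      cache->maybe_unpin(this);
    }
    if (--nref == 0) {
      delete this;
    }
  }
};

void Cache::add(Object* o) {
  o->cached = true;
  o->get();  // the cache's own reference keeps pin_nref >= 1
  if (o->pin_nref == 1) {  // no user holds it: put it on the LRU
    lru.push_front(o);
    o->in_lru = true;
  }
}

void Cache::maybe_unpin(Object* o) {
  // The real code re-validates this under the shard lock.
  if (o->cached && o->pin_nref == 1 && !o->in_lru) {
    lru.push_front(o);
    o->in_lru = true;
  }
}

void Cache::trim_all() {
  for (auto it = lru.begin(); it != lru.end();) {
    Object* o = *it;
    if (o->pin_nref > 1) {  // a user grabbed it meanwhile; skip
      ++it;
      continue;
    }
    it = lru.erase(it);
    o->in_lru = false;
    o->cached = false;
    o->put();  // drop the cache's reference; may delete the object
  }
}

int main() {
  Cache cache;
  Object* o = new Object(&cache);
  o->get();          // user reference: object is pinned
  cache.add(o);      // cache reference; stays off the LRU while pinned
  o->put();          // user drops it -> maybe_unpin moves it to the LRU
  cache.trim_all();  // cache drops its reference -> single delete
  std::cout << "trimmed\n";
}

The invariant is the one the patch relies on: pin_nref == 1 means only the
cache holds the onode, so it may sit on the LRU and be released by trimming
exactly once; pin_nref > 1 means a user holds it and trimming must skip it,
which is what removes the duplicate-release window.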
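The while-loop at the top of maybe_unpin() is also worth a note: the shard
owning an onode can change while a thread is blocked on the shard lock
(split_cache moves onodes while holding the shards' locks), so the code locks
a shard and then re-validates ownership, retrying until the shard it locked
is still the current one. A generic sketch of that pattern, with hypothetical
Shard/Owner types standing in for OnodeCacheShard and Onode:

// Lock-then-revalidate: after acquiring the lock, re-read the owner's
// shard pointer; if it changed while we were blocked, release and retry.
#include <atomic>
#include <mutex>

struct Shard {
  std::mutex lock;
};

struct Owner {
  std::atomic<Shard*> shard;  // updated concurrently by rebalancing code

  explicit Owner(Shard* s) : shard(s) {}

  Shard* lock_current_shard() {
    Shard* s = shard.load();
    s->lock.lock();
    while (s != shard.load()) {  // moved while we were waiting?
      s->lock.unlock();
      s = shard.load();
      s->lock.lock();
    }
    return s;  // locked and still the owner's shard
  }
};

The post-lock re-check suffices because a mover must itself hold the shard
lock to move the object: once we hold the lock and the pointer still names
our shard, no move can be in flight.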