From: Igor Fedotov Date: Fri, 24 Jan 2020 22:18:47 +0000 (+0300) Subject: os/bluestore: simplify Onode pin/unpin logic. X-Git-Tag: v16.1.0~1756^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=3a938749f352ce8e220314da9b6e344e67eaf1fd;p=ceph.git os/bluestore: simplify Onode pin/unpin logic. This patch makes in/unpin logic more straightforward and hopefully safe in multi-threading. Also it eliminates Onode's pin_list as one doesn't need it to track pinned Onodes. OnodeCacheShard reference has been removed from onode too. cached/pinned flags have been introduced. They're accessed under cache lock hence original race has been totally addressed. Signed-off-by: Igor Fedotov --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 463628280c10..66a60d6620c1 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -790,77 +790,50 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard { BlueStore::Onode, boost::intrusive::list_member_hook<>, &BlueStore::Onode::lru_item> > list_t; - typedef boost::intrusive::list< - BlueStore::Onode, - boost::intrusive::member_hook< - BlueStore::Onode, - boost::intrusive::list_member_hook<>, - &BlueStore::Onode::pin_item> > pin_list_t; list_t lru; - pin_list_t pin_list; explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {} - void _add(BlueStore::OnodeRef& o, int level) override + void _add(BlueStore::Onode* o, int level) override { - ceph_assert(o->s == nullptr); - o->s = this; - if (o->nref > 1) { - pin_list.push_front(*o); - o->pinned = true; - num_pinned = pin_list.size(); - } else { + if (o->put_cache()) { (level > 0) ? lru.push_front(*o) : lru.push_back(*o); + } else { + ++num_pinned; } - num = lru.size(); + ++num; // we count both pinned and unpinned entries } - void _rm(BlueStore::OnodeRef& o) override + void _rm(BlueStore::Onode* o) override { - o->s = nullptr; - if (o->pinned) { - o->pinned = false; - pin_list.erase(pin_list.iterator_to(*o)); - } else { + if (o->pop_cache()) { lru.erase(lru.iterator_to(*o)); + } else { + --num_pinned; } - num = lru.size(); - num_pinned = pin_list.size(); + --num; } - void _touch(BlueStore::OnodeRef& o) override + void _touch(BlueStore::Onode* o) override { - if (o->pinned) { - return; + ceph_assert(o->cached); + if (!o->pinned) { + lru.erase(lru.iterator_to(*o)); + lru.push_front(*o); } - lru.erase(lru.iterator_to(*o)); - lru.push_front(*o); - num = lru.size(); } - void _pin(BlueStore::Onode& o) override + void _pin(BlueStore::Onode* o) override { - if (o.pinned == true) { - return; - } - lru.erase(lru.iterator_to(o)); - pin_list.push_front(o); - o.pinned = true; - num = lru.size(); - num_pinned = pin_list.size(); - dout(30) << __func__ << " " << o.oid << " pinned" << dendl; - - } - void _unpin(BlueStore::Onode& o) override + lru.erase(lru.iterator_to(*o)); + ++num_pinned; + dout(30) << __func__ << " " << o->oid << " pinned" << dendl; + } + void _unpin(BlueStore::Onode* o) override { - if (o.pinned == false) { - return; - } - pin_list.erase(pin_list.iterator_to(o)); - lru.push_front(o); - o.pinned = false; - num = lru.size(); - num_pinned = pin_list.size(); - dout(30) << __func__ << " " << o.oid << " unpinned" << dendl; + lru.push_front(*o); + --num_pinned; + dout(30) << __func__ << " " << o->oid << " unpinned" << dendl; } + void _trim_to(uint64_t new_size) override { if (new_size >= lru.size()) { @@ -879,17 +852,17 @@ struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard { lru.erase(p); ceph_assert(n == 1); } - o->s = nullptr; - o->get(); // paranoia + if (!o->pop_cache()) { + --num_pinned; + } o->c->onode_map.remove(o->oid); - o->put(); --n; } num = lru.size(); } void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override { - *onodes += num + num_pinned; + *onodes += num; *pinned_onodes += num_pinned; } }; @@ -1584,7 +1557,8 @@ void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStor #undef dout_prefix #define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") " -BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o) +BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, + OnodeRef& o) { std::lock_guard l(cache->lock); auto p = onode_map.find(oid); @@ -1596,7 +1570,7 @@ BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o } ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl; onode_map[oid] = o; - cache->_add(o, 1); + cache->_add(o.get(), 1); cache->_trim(); return o; } @@ -1615,9 +1589,11 @@ BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid) } else { ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second << dendl; - cache->_touch(p->second); - hit = true; + // assign before _touch to pin object if needed and avoid + // cache touch o = p->second; + cache->_touch(p->second.get()); + hit = true; } } @@ -1634,7 +1610,7 @@ void BlueStore::OnodeSpace::clear() std::lock_guard l(cache->lock); ldout(cache->cct, 10) << __func__ << dendl; for (auto &p : onode_map) { - cache->_rm(p.second); + cache->_rm(p.second.get()); } onode_map.clear(); } @@ -1663,7 +1639,7 @@ void BlueStore::OnodeSpace::rename( if (pn != onode_map.end()) { ldout(cache->cct, 30) << __func__ << " removing target " << pn->second << dendl; - cache->_rm(pn->second); + cache->_rm(pn->second.get()); onode_map.erase(pn); } OnodeRef o = po->second; @@ -1671,10 +1647,10 @@ void BlueStore::OnodeSpace::rename( // install a non-existent onode at old location oldo.reset(new Onode(o->c, old_oid, o->key)); po->second = oldo; - cache->_add(po->second, 1); + cache->_add(oldo.get(), 1); // add at new position and fix oid, key onode_map.insert(make_pair(new_oid, o)); - cache->_touch(o); + cache->_touch(o.get()); o->oid = new_oid; o->key = new_okey; cache->_trim(); @@ -3236,6 +3212,28 @@ BlueStore::BlobRef BlueStore::ExtentMap::split_blob( #undef dout_prefix #define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " " +void BlueStore::Onode::get() { + if (++nref == 2) { + c->get_onode_cache()->pin(this, [&]() { + bool r = cached && !pinned; + pinned = true; + return r; + }); + } +} +void BlueStore::Onode::put() { + int n = --nref; + if (n == 1) { + c->get_onode_cache()->unpin(this, [&]() { + bool r = cached && pinned; + pinned = false; + return r; + }); + } else if (n == 0) { + delete this; + } +} + BlueStore::Onode* BlueStore::Onode::decode( CollectionRef c, const ghobject_t& oid, @@ -3673,23 +3671,13 @@ void BlueStore::Collection::split_cache( ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid << dendl; - // move the onode to the new map before futzing with the cache - // shard, ensuring that nref is always >= 2, and no racing - // thread can trigger a pin or unpin (which does *not* behave - // well when we are clearing and resetting the 's' shard - // pointer!). + // ensuring that nref is always >= 2 and hence onode is pinned and + // physically out of cache during the transition + OnodeRef o_pin = o; + p = onode_map.onode_map.erase(p); dest->onode_map.onode_map[o->oid] = o; - - if (onode_map.cache != dest->onode_map.cache) { - // move onode to a different cache shard - onode_map.cache->_rm(o); - o->c = dest; - dest->onode_map.cache->_add(o, 1); - } else { - // the onode is in the same cache shard, making our move simpler. - o->c = dest; - } + o->c = dest; // move over shared blobs and buffers. cover shared blobs from // both extent map and spanning blob map (the full extent map diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index ff052737a806..24d83aafbd3f 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1055,13 +1055,9 @@ public: }; struct OnodeSpace; - struct OnodeCacheShard; /// an in-memory object struct Onode { MEMPOOL_CLASS_HELPERS(); - // Not persisted and updated on cache insertion/removal - OnodeCacheShard *s; - bool pinned = false; // Only to be used by the onode cache shard std::atomic_int nref; ///< reference count Collection *c; @@ -1070,11 +1066,15 @@ public: /// key under PREFIX_OBJ where we are stored mempool::bluestore_cache_other::string key; - boost::intrusive::list_member_hook<> lru_item, pin_item; + boost::intrusive::list_member_hook<> lru_item; bluestore_onode_t onode; ///< metadata stored as value in kv store bool exists; ///< true if object logically exists - + bool cached; ///< Onode is logically in the cache + /// (it can be pinned and hence physically out + /// of it at the moment though) + bool pinned; ///< Onode is pinned + /// (or should be pinned when cached) ExtentMap extent_map; // track txc's that have not been committed to kv store (and whose @@ -1087,32 +1087,35 @@ public: Onode(Collection *c, const ghobject_t& o, const mempool::bluestore_cache_other::string& k) - : s(nullptr), - nref(0), + : nref(0), c(c), oid(o), key(k), exists(false), + cached(false), + pinned(false), extent_map(this) { } Onode(Collection* c, const ghobject_t& o, const std::string& k) - : s(nullptr), - nref(0), + : nref(0), c(c), oid(o), key(k), exists(false), + cached(false), + pinned(false), extent_map(this) { } Onode(Collection* c, const ghobject_t& o, const char* k) - : s(nullptr), - nref(0), + : nref(0), c(c), oid(o), key(k), exists(false), + cached(false), + pinned(false), extent_map(this) { } @@ -1125,19 +1128,18 @@ public: void dump(ceph::Formatter* f) const; void flush(); - void get() { - if (++nref == 2 && s != nullptr) { - s->pin(*this); - } + void get(); + void put(); + + inline bool put_cache() { + ceph_assert(!cached); + cached = true; + return !pinned; } - void put() { - int n = --nref; - if (n == 1 && s != nullptr) { - s->unpin(*this); - } - if (n == 0) { - delete this; - } + inline bool pop_cache() { + ceph_assert(cached); + cached = false; + return !pinned; } const std::string& get_omap_prefix(); @@ -1204,24 +1206,30 @@ public: std::atomic num_pinned = {0}; std::array, 64> dumped_onodes; + + virtual void _pin(Onode* o) = 0; + virtual void _unpin(Onode* o) = 0; + public: OnodeCacheShard(CephContext* cct) : CacheShard(cct) {} static OnodeCacheShard *create(CephContext* cct, std::string type, PerfCounters *logger); - virtual void _add(OnodeRef& o, int level) = 0; - virtual void _rm(OnodeRef& o) = 0; - virtual void _touch(OnodeRef& o) = 0; - virtual void _pin(Onode& o) = 0; - virtual void _unpin(Onode& o) = 0; + virtual void _add(Onode* o, int level) = 0; + virtual void _rm(Onode* o) = 0; + virtual void _touch(Onode* o) = 0; - void pin(Onode& o) { + void pin(Onode* o, std::function validator) { std::lock_guard l(lock); - _pin(o); + if (validator()) { + _pin(o); + } } - void unpin(Onode& o) { + void unpin(Onode* o, std::function validator) { std::lock_guard l(lock); - _unpin(o); + if (validator()) { + _unpin(o); + } } virtual void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) = 0; @@ -1290,7 +1298,7 @@ public: clear(); } - OnodeRef add(const ghobject_t& oid, OnodeRef o); + OnodeRef add(const ghobject_t& oid, OnodeRef& o); OnodeRef lookup(const ghobject_t& o); void remove(const ghobject_t& oid) { onode_map.erase(oid); @@ -1331,6 +1339,9 @@ public: pool_opts_t pool_opts; ContextQueue *commit_queue; + OnodeCacheShard* get_onode_cache() const { + return onode_map.cache; + } OnodeRef get_onode(const ghobject_t& oid, bool create, bool is_createop=false); // the terminology is confusing here, sorry!