From: Sage Weil Date: Thu, 2 Jun 2016 19:53:32 +0000 (-0400) Subject: os/bluestore: unify lrus into single Cache class X-Git-Tag: v11.0.0~302^2~6 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=51c2088119499eae183b553f158ff5ff24abef6d;p=ceph-ci.git os/bluestore: unify lrus into single Cache class This will be sharded soon, but for now there's just one instance. Signed-off-by: Sage Weil --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 267aff753df..f17d61f386f 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -960,8 +960,8 @@ OPTION(bluestore_compression, OPT_STR, "none") // force|aggressive|passive|none OPTION(bluestore_compression_algorithm, OPT_STR, "snappy") OPTION(bluestore_compression_min_blob_size, OPT_U32, 256*1024) OPTION(bluestore_compression_max_blob_size, OPT_U32, 4*1024*1024) -OPTION(bluestore_onode_map_size, OPT_U32, 1024) // onodes per collection -OPTION(bluestore_collection_buffer_cache_size, OPT_U32, 16*1024*1024) // per collection!! 
+OPTION(bluestore_onode_cache_size, OPT_U32, 16*1024) +OPTION(bluestore_buffer_cache_size, OPT_U32, 256*1024*1024) OPTION(bluestore_cache_tails, OPT_BOOL, true) // cache tail blocks in Onode OPTION(bluestore_kvbackend, OPT_STR, "rocksdb") OPTION(bluestore_allocator, OPT_STR, "stupid") // or "bitmap" diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 672796168a3..74dd92312e1 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -436,10 +436,8 @@ static void get_wal_key(uint64_t seq, string *out) _key_encode_u64(seq, out); } -// BufferCache -#undef dout_prefix -#define dout_prefix *_dout << "bluestore.BufferCache(" << this << ") " +// Buffer ostream& operator<<(ostream& out, const BlueStore::Buffer& b) { @@ -451,25 +449,45 @@ ostream& operator<<(ostream& out, const BlueStore::Buffer& b) return out << ")"; } -void BlueStore::BufferCache::trim(uint64_t keep) + +// Cache +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.Cache(" << this << ") " + +void BlueStore::Cache::_touch_onode(OnodeRef& o) +{ + auto p = onode_lru.iterator_to(*o); + onode_lru.erase(p); + onode_lru.push_front(*o); +} + +void BlueStore::Cache::trim(uint64_t onode_max, uint64_t buffer_max) { - audit_lru(); - auto i = lru.end(); - if (size) { - assert(i != lru.begin()); + std::lock_guard<std::mutex> l(lock); + + dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max + << " buffers " << buffer_size << " / " << buffer_max + << dendl; + + _audit_lru(); + + // buffers + auto i = buffer_lru.end(); + if (buffer_size) { + assert(i != buffer_lru.begin()); --i; } - while (size > keep) { + while (buffer_size > buffer_max) { Buffer *b = &*i; if (b->is_clean()) { auto p = b->space->buffer_map.find(b->offset); - if (i != lru.begin()) { + if (i != buffer_lru.begin()) { --i; } dout(20) << __func__ << " rm " << *b << dendl; b->space->_rm_buffer(p); } else { - if (i != lru.begin()) { + if (i != buffer_lru.begin()) { --i; continue; } else { 
@@ -477,24 +495,55 @@ void BlueStore::BufferCache::trim(uint64_t keep) } } } + + // onodes + int num = onode_lru.size() - onode_max; + if (num <= 0) + return; // don't even try + + auto p = onode_lru.end(); + if (num) + --p; + while (num > 0) { + Onode *o = &*p; + int refs = o->nref.load(); + if (refs > 1) { + dout(20) << __func__ << " " << o->oid << " has " << refs + << " refs; stopping with " << num << " left to trim" << dendl; + break; + } + dout(30) << __func__ << " trim " << o->oid << dendl; + if (p != onode_lru.begin()) { + onode_lru.erase(p--); + } else { + onode_lru.erase(p); + assert(num == 1); + } + o->get(); // paranoia + o->space->onode_map.erase(o->oid); + o->bc._clear(); // clear buffers, too + o->put(); + --num; + } } #ifdef DEBUG_CACHE -void BlueStore::BufferCache::audit_lru() +void BlueStore::Cache::_audit_lru() { if (true) { uint64_t s = 0; - for (auto i = lru.begin(); i != lru.end(); ++i) { + for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) { s += i->length; } - if (s != size) { - derr << __func__ << " size " << size << " actual " << s << dendl; - for (auto i = lru.begin(); i != lru.end(); ++i) { + if (s != buffer_size) { + derr << __func__ << " buffer_size " << buffer_size << " actual " << s + << dendl; + for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) { derr << __func__ << " " << *i << dendl; } - assert(s == size); + assert(s == buffer_size); } - dout(20) << __func__ << " size " << size << " ok" << dendl; + dout(20) << __func__ << " buffer_size " << buffer_size << " ok" << dendl; } } #endif @@ -502,11 +551,21 @@ void BlueStore::BufferCache::audit_lru() // BufferSpace #undef dout_prefix -#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << ") " +#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") " + +void BlueStore::BufferSpace::_clear() +{ + // note: we already hold cache->lock + dout(10) << __func__ << dendl; + while (!buffer_map.empty()) { + 
_rm_buffer(buffer_map.begin()); + } +} -void BlueStore::BufferSpace::discard(uint64_t offset, uint64_t length) +void BlueStore::BufferSpace::_discard(uint64_t offset, uint64_t length) { - cache->audit_lru(); + std::lock_guard<std::mutex> l(cache->lock); + cache->_audit_lru(); auto i = _data_lower_bound(offset); uint64_t end = offset + length; while (i != buffer_map.end()) { @@ -526,13 +585,13 @@ void BlueStore::BufferSpace::discard(uint64_t offset, uint64_t length) } else { _add_buffer(new Buffer(this, b->state, b->seq, end, tail)); } - cache->size -= b->length - front; + cache->buffer_size -= b->length - front; b->truncate(front); - cache->audit_lru(); + cache->_audit_lru(); return; } else { // drop tail - cache->size -= b->length - front; + cache->buffer_size -= b->length - front; b->truncate(front); ++i; continue; @@ -554,7 +613,7 @@ void BlueStore::BufferSpace::discard(uint64_t offset, uint64_t length) _add_buffer(new Buffer(this, b->state, b->seq, end, keep)); _rm_buffer(i); } - cache->audit_lru(); + cache->_audit_lru(); return; } } @@ -564,6 +623,7 @@ void BlueStore::BufferSpace::read( BlueStore::ready_regions_t& res, interval_set<uint64_t>& res_intervals) { + std::lock_guard<std::mutex> l(cache->lock); res.clear(); uint64_t end = offset + length; for (auto i = _data_lower_bound(offset); @@ -579,6 +639,7 @@ void BlueStore::BufferSpace::read( res_intervals.insert(offset, l); offset += l; length -= l; + cache->_touch_buffer(b); continue; } if (b->offset > offset) { @@ -601,12 +662,14 @@ void BlueStore::BufferSpace::read( offset += b->length; length -= b->length; } + cache->_touch_buffer(b); } } } void BlueStore::BufferSpace::finish_write(uint64_t seq) { + std::lock_guard<std::mutex> l(cache->lock); auto i = writing.begin(); while (i != writing.end()) { Buffer *b = &*i; @@ -624,63 +687,26 @@ void BlueStore::BufferSpace::finish_write(uint64_t seq) ++i; } } - cache->audit_lru(); -} - - -// Bnode - -#undef dout_prefix -#define dout_prefix *_dout << "bluestore.bnode(" << this << ") " - -void 
BlueStore::Bnode::put() -{ - if (--nref == 0) { - dout(20) << __func__ << " removing self from set " << bnode_set << dendl; - bnode_set->uset.erase(*this); - delete this; - } -} - -// Onode - -#undef dout_prefix -#define dout_prefix *_dout << "bluestore.onode(" << this << ") " - -void BlueStore::Onode::flush() -{ - std::unique_lock<std::mutex> l(flush_lock); - dout(20) << __func__ << " " << flush_txns << dendl; - while (!flush_txns.empty()) - flush_cond.wait(l); - dout(20) << __func__ << " done" << dendl; + cache->_audit_lru(); } -// OnodeHashLRU +// OnodeSpace #undef dout_prefix -#define dout_prefix *_dout << "bluestore.lru(" << this << ") " - -void BlueStore::OnodeHashLRU::_touch(OnodeRef o) -{ - auto p = lru.iterator_to(*o); - lru.erase(p); - lru.push_front(*o); -} +#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") " -void BlueStore::OnodeHashLRU::add(const ghobject_t& oid, OnodeRef o) +void BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o) { - std::lock_guard<std::mutex> l(lock); + std::lock_guard<std::mutex> l(cache->lock); dout(30) << __func__ << " " << oid << " " << o << dendl; assert(onode_map.count(oid) == 0); onode_map[oid] = o; - lru.push_front(*o); - _trim(max_size); + cache->onode_lru.push_front(*o); } -BlueStore::OnodeRef BlueStore::OnodeHashLRU::lookup(const ghobject_t& oid) +BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid) { - std::lock_guard<std::mutex> l(lock); + std::lock_guard<std::mutex> l(cache->lock); dout(30) << __func__ << dendl; ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid); if (p == onode_map.end()) { @@ -688,23 +714,29 @@ BlueStore::OnodeRef BlueStore::OnodeHashLRU::lookup(const ghobject_t& oid) return OnodeRef(); } dout(30) << __func__ << " " << oid << " hit " << p->second << dendl; - _touch(p->second); + cache->_touch_onode(p->second); return p->second; } -void BlueStore::OnodeHashLRU::clear() +void BlueStore::OnodeSpace::clear() { - std::lock_guard<std::mutex> l(lock); + std::lock_guard<std::mutex> l(cache->lock); dout(10) << __func__ << 
dendl; - lru.clear(); + for (auto &p : onode_map) { + auto q = cache->onode_lru.iterator_to(*p.second); + cache->onode_lru.erase(q); + + // clear buffers too, while we have cache->lock + p.second->bc._clear(); + } onode_map.clear(); } -void BlueStore::OnodeHashLRU::rename(OnodeRef& oldo, +void BlueStore::OnodeSpace::rename(OnodeRef& oldo, const ghobject_t& old_oid, const ghobject_t& new_oid) { - std::lock_guard<std::mutex> l(lock); + std::lock_guard<std::mutex> l(cache->lock); dout(30) << __func__ << " " << old_oid << " -> " << new_oid << dendl; ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn; po = onode_map.find(old_oid); @@ -714,33 +746,33 @@ void BlueStore::OnodeHashLRU::rename(OnodeRef& oldo, assert(po != onode_map.end()); if (pn != onode_map.end()) { dout(30) << __func__ << " removing target " << pn->second << dendl; - auto p = lru.iterator_to(*pn->second); - lru.erase(p); + auto p = cache->onode_lru.iterator_to(*pn->second); + cache->onode_lru.erase(p); onode_map.erase(pn); } OnodeRef o = po->second; // install a non-existent onode at old location - oldo.reset(new Onode(old_oid, o->key, o->bc.cache)); + oldo.reset(new Onode(this, old_oid, o->key, o->bc.cache)); po->second = oldo; - lru.push_back(*po->second); + cache->onode_lru.push_back(*po->second); // add at new position and fix oid, key onode_map.insert(make_pair(new_oid, o)); - _touch(o); + cache->_touch_onode(o); o->oid = new_oid; get_object_key(new_oid, &o->key); } -bool BlueStore::OnodeHashLRU::get_next( +bool BlueStore::OnodeSpace::get_next( const ghobject_t& after, pair<ghobject_t,OnodeRef> *next) { - std::lock_guard<std::mutex> l(lock); + std::lock_guard<std::mutex> l(cache->lock); dout(20) << __func__ << " after " << after << dendl; if (after == ghobject_t()) { - if (lru.empty()) { + if (cache->onode_lru.empty()) { return false; } ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.begin(); @@ -752,9 +784,9 @@ bool BlueStore::OnodeHashLRU::get_next( ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(after); assert(p != onode_map.end()); // for now - auto pi = 
lru.iterator_to(*p->second); + auto pi = cache->onode_lru.iterator_to(*p->second); ++pi; - if (pi == lru.end()) { + if (pi == cache->onode_lru.end()) { return false; } next->first = pi->oid; @@ -762,50 +794,37 @@ bool BlueStore::OnodeHashLRU::get_next( return true; } -int BlueStore::OnodeHashLRU::trim(int max) + +// Bnode + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.bnode(" << this << ") " + +void BlueStore::Bnode::put() { - std::lock_guard<std::mutex> l(lock); - if (max < 0) { - max = max_size; + if (--nref == 0) { + dout(20) << __func__ << " removing self from set " << bnode_set << dendl; + bnode_set->uset.erase(*this); + delete this; } - return _trim(max); } -int BlueStore::OnodeHashLRU::_trim(int max) +// Onode + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.onode(" << this << ") " + +void BlueStore::Onode::flush() { - dout(20) << __func__ << " max " << max << " size " << onode_map.size() << dendl; - int trimmed = 0; - int num = onode_map.size() - max; - if (onode_map.size() == 0 || num <= 0) - return 0; // don't even try - - auto p = lru.end(); - if (num) - --p; - while (num > 0) { - Onode *o = &*p; - int refs = o->nref.load(); - if (refs > 1) { - dout(20) << __func__ << " " << o->oid << " has " << refs - << " refs; stopping with " << num << " left to trim" << dendl; - break; - } - dout(30) << __func__ << " trim " << o->oid << dendl; - if (p != lru.begin()) { - lru.erase(p--); - } else { - lru.erase(p); - assert(num == 1); - } - o->get(); // paranoia - onode_map.erase(o->oid); - o->put(); - --num; - ++trimmed; - } - return trimmed; + std::unique_lock<std::mutex> l(flush_lock); + dout(20) << __func__ << " " << flush_txns << dendl; + while (!flush_txns.empty()) + flush_cond.wait(l); + dout(20) << __func__ << " done" << dendl; } + + // ======================================================= // Collection @@ -818,8 +837,8 @@ BlueStore::Collection::Collection(BlueStore *ns, coll_t c) cid(c), lock("BlueStore::Collection::lock", true, false), exists(true), 
- bnode_set(g_conf->bluestore_onode_map_size), - onode_map(g_conf->bluestore_onode_map_size) + bnode_set(MAX(16, g_conf->bluestore_onode_cache_size / 128)), + onode_map(&ns->cache) { } @@ -896,11 +915,11 @@ BlueStore::OnodeRef BlueStore::Collection::get_onode( return OnodeRef(); // new - on = new Onode(oid, key, &buffer_cache); + on = new Onode(&onode_map, oid, key, &cache); } else { // loaded assert(r >=0); - on = new Onode(oid, key, &buffer_cache); + on = new Onode(&onode_map, oid, key, &cache); on->exists = true; bufferlist::iterator p = v.begin(); ::decode(on->onode, p); @@ -4443,13 +4462,6 @@ void BlueStore::_osr_reap_done(OpSequencer *osr) break; } - if (txc->first_collection) { - RWLock::WLocker l(txc->first_collection->lock); - txc->first_collection->onode_map.trim(); - txc->first_collection->buffer_cache.trim( - g_conf->bluestore_collection_buffer_cache_size); - } - osr->q.pop_front(); txc->log_state_latency(logger, l_bluestore_state_done_lat); delete txc; @@ -4457,6 +4469,9 @@ void BlueStore::_osr_reap_done(OpSequencer *osr) if (osr->q.empty()) dout(20) << __func__ << " osr " << osr << " q now empty" << dendl; } + + cache.trim(g_conf->bluestore_onode_cache_size, + g_conf->bluestore_buffer_cache_size); } void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t) @@ -4846,10 +4861,6 @@ void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t) for (vector::iterator p = i.colls.begin(); p != i.colls.end(); ++p, ++j) { cvec[j] = _get_collection(*p); - - // note first collection we reference - if (!j && !txc->first_collection) - txc->first_collection = cvec[j]; } vector ovec(i.objects.size()); diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 0df6ff72c3f..a11561b4fe0 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -101,8 +101,9 @@ public: typedef map extents2read_t; typedef map ready_regions_t; - /// cached buffer struct BufferSpace; + + /// cached buffer struct 
Buffer { enum { STATE_UNDEF = 0, @@ -181,26 +182,7 @@ public: } }; - /// manage a collection of buffers (per-collection, currently) - struct BufferCache { - typedef boost::intrusive::list< - Buffer, - boost::intrusive::member_hook< - Buffer, - boost::intrusive::list_member_hook<>, - &Buffer::lru_item> > buffer_lru_list_t; - - buffer_lru_list_t lru; - uint64_t size = 0; - - void trim(uint64_t keep); - -#ifdef DEBUG_CACHE - void audit_lru(); -#else - void audit_lru() { /* no-op */ } -#endif - }; + struct Cache; /// map logical extent range (object) onto buffers struct BufferSpace { @@ -212,38 +194,33 @@ public: &Buffer::state_item> > state_list_t; map> buffer_map; - BufferCache *cache; + Cache *cache; state_list_t writing; - BufferSpace(BufferCache *c) : cache(c) {} + BufferSpace(Cache *c) : cache(c) {} void _add_buffer(Buffer *b) { + cache->_audit_lru(); buffer_map[b->offset].reset(b); - cache->lru.push_front(*b); - cache->size += b->length; + cache->buffer_lru.push_front(*b); + cache->buffer_size += b->length; if (b->is_writing()) { writing.push_back(*b); } - cache->audit_lru(); + cache->_audit_lru(); } void _rm_buffer(Buffer *b) { _rm_buffer(buffer_map.find(b->offset)); } void _rm_buffer(map>::iterator p) { - cache->size -= p->second->length; - cache->lru.erase(cache->lru.iterator_to(*p->second)); + cache->_audit_lru(); + cache->buffer_size -= p->second->length; + cache->buffer_lru.erase(cache->buffer_lru.iterator_to(*p->second)); if (p->second->is_writing()) { writing.erase(writing.iterator_to(*p->second)); } buffer_map.erase(p); - cache->audit_lru(); - } - - /// move to top of lru - void _touch_buffer(Buffer *b) { - auto p = cache->lru.iterator_to(*b); - cache->lru.erase(p); - cache->lru.push_front(*b); + cache->_audit_lru(); } map>::iterator _data_lower_bound( @@ -261,16 +238,24 @@ public: return buffer_map.empty(); } - void discard(uint64_t offset, uint64_t length); + void _clear(); + + void discard(uint64_t offset, uint64_t length) { + std::lock_guard 
l(cache->lock); + _discard(offset, length); + } + void _discard(uint64_t offset, uint64_t length); void write(uint64_t seq, uint64_t offset, bufferlist& bl, unsigned flags) { - discard(offset, bl.length()); + std::lock_guard l(cache->lock); + _discard(offset, bl.length()); _add_buffer(new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl, flags)); } void finish_write(uint64_t seq); void did_read(uint64_t offset, bufferlist& bl) { - discard(offset, bl.length()); + std::lock_guard l(cache->lock); + _discard(offset, bl.length()); _add_buffer(new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl)); } @@ -283,6 +268,7 @@ public: } void dump(Formatter *f) const { + std::lock_guard l(cache->lock); f->open_array_section("buffers"); for (auto& i : buffer_map) { f->open_object_section("buffer"); @@ -294,9 +280,9 @@ public: } }; - /// an in-memory extent-map, shared by a group of objects (w/ same hash value) struct BnodeSet; + /// an in-memory extent-map, shared by a group of objects (w/ same hash value) struct Bnode : public boost::intrusive::unordered_set_base_hook<> { std::atomic_int nref; ///< reference count uint32_t hash; @@ -360,12 +346,16 @@ public: } }; + struct OnodeSpace; + /// an in-memory object struct Onode { std::atomic_int nref; ///< reference count ghobject_t oid; string key; ///< key under PREFIX_OBJ where we are stored + + OnodeSpace *space; ///< containing OnodeSpace boost::intrusive::list_member_hook<> lru_item; BnodeRef bnode; ///< ref to Bnode [optional] @@ -379,10 +369,11 @@ public: BufferSpace bc; - Onode(const ghobject_t& o, const string& k, BufferCache *c) + Onode(OnodeSpace *s, const ghobject_t& o, const string& k, Cache *c) : nref(0), oid(o), key(k), + space(s), exists(false), bc(c) { } @@ -407,7 +398,14 @@ public: }; typedef boost::intrusive_ptr OnodeRef; - struct OnodeHashLRU { + /// a cache (shard) of onodes and buffers + struct Cache { + typedef boost::intrusive::list< + Buffer, + boost::intrusive::member_hook< + Buffer, + 
boost::intrusive::list_member_hook<>, + &Buffer::lru_item> > buffer_lru_list_t; typedef boost::intrusive::list< Onode, boost::intrusive::member_hook< @@ -415,21 +413,43 @@ public: boost::intrusive::list_member_hook<>, &Onode::lru_item> > onode_lru_list_t; - std::mutex lock; + std::mutex lock; ///< protect lru and other structures + buffer_lru_list_t buffer_lru; + uint64_t buffer_size = 0; + onode_lru_list_t onode_lru; + + void _touch_onode(OnodeRef& o); + + void _touch_buffer(Buffer *b) { + auto p = buffer_lru.iterator_to(*b); + buffer_lru.erase(p); + buffer_lru.push_front(*b); + _audit_lru(); + } + + void trim(uint64_t onode_max, uint64_t buffer_max); + +#ifdef DEBUG_CACHE + void _audit_lru(); +#else + void _audit_lru() { /* no-op */ } +#endif + }; + + struct OnodeSpace { + Cache *cache; ceph::unordered_map onode_map; ///< forward lookups - onode_lru_list_t lru; ///< lru - size_t max_size; - OnodeHashLRU(size_t s) : max_size(s) {} + OnodeSpace(Cache *c) : cache(c) {} + ~OnodeSpace() { + clear(); + } void add(const ghobject_t& oid, OnodeRef o); - void _touch(OnodeRef o); OnodeRef lookup(const ghobject_t& o); void rename(OnodeRef& o, const ghobject_t& old_oid, const ghobject_t& new_oid); void clear(); bool get_next(const ghobject_t& after, pair *next); - int trim(int max=-1); - int _trim(int max); }; struct Collection : public CollectionImpl { @@ -444,8 +464,8 @@ public: // cache onodes on a per-collection basis to avoid lock // contention. - OnodeHashLRU onode_map; - BufferCache buffer_cache; + OnodeSpace onode_map; + Cache cache; OnodeRef get_onode(const ghobject_t& oid, bool create); BnodeRef get_bnode(uint32_t hash); @@ -564,8 +584,6 @@ public: IOContext ioc; - CollectionRef first_collection; ///< first referenced collection - uint64_t seq = 0; utime_t start; @@ -796,6 +814,8 @@ private: RWLock coll_lock; ///< rwlock to protect coll_map ceph::unordered_map coll_map; + Cache cache; + std::mutex nid_lock; uint64_t nid_last; uint64_t nid_max;