From: Sage Weil Date: Thu, 2 Jun 2016 20:33:01 +0000 (-0400) Subject: os/bluestore: shard the cache X-Git-Tag: v11.0.0~302^2~3 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bf87e3a2216746569e3c0b27030f73611c15fae5;p=ceph.git os/bluestore: shard the cache Signed-off-by: Sage Weil --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 74dd92312e1..81ce6da64e6 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -832,13 +832,14 @@ void BlueStore::Onode::flush() #undef dout_prefix #define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << ") " -BlueStore::Collection::Collection(BlueStore *ns, coll_t c) +BlueStore::Collection::Collection(BlueStore *ns, Cache *cs, coll_t c) : store(ns), + cache(cs), cid(c), lock("BlueStore::Collection::lock", true, false), exists(true), bnode_set(MAX(16, g_conf->bluestore_onode_cache_size / 128)), - onode_map(&ns->cache) + onode_map(cs) { } @@ -915,11 +916,11 @@ BlueStore::OnodeRef BlueStore::Collection::get_onode( return OnodeRef(); // new - on = new Onode(&onode_map, oid, key, &cache); + on = new Onode(&onode_map, oid, key, cache); } else { // loaded assert(r >=0); - on = new Onode(&onode_map, oid, key, &cache); + on = new Onode(&onode_map, oid, key, cache); on->exists = true; bufferlist::iterator p = v.begin(); ::decode(on->onode, p); @@ -985,6 +986,7 @@ BlueStore::BlueStore(CephContext *cct, const string& path) { _init_logger(); g_ceph_context->_conf->add_observer(this); + set_cache_shards(1); } BlueStore::~BlueStore() @@ -995,6 +997,10 @@ BlueStore::~BlueStore() assert(db == NULL); assert(bluefs == NULL); assert(fsid_fd < 0); + for (auto i : cache_shards) { + delete i; + } + cache_shards.clear(); } const char **BlueStore::get_tracked_conf_keys() const @@ -1956,7 +1962,11 @@ int BlueStore::_open_collections(int *errors) it->next()) { coll_t cid; if (cid.parse(it->key())) { - CollectionRef c(new Collection(this, cid)); + CollectionRef c( + new Collection( + this, + cache_shards[cid.hash_to_shard(cache_shards.size())], + cid)); bufferlist bl = it->value(); bufferlist::iterator p = bl.begin(); try { @@ -2213,6 +2223,17 @@ int BlueStore::mkfs() return r; } +void BlueStore::set_cache_shards(unsigned num) +{ + dout(10) << __func__ << " " << num << dendl; + size_t old = cache_shards.size(); + assert(num >= old); + cache_shards.resize(num); + for (unsigned i = old; i < num; ++i) { + cache_shards[i] = new Cache; + } +} + int BlueStore::mount() { dout(1) << __func__ << " path " << path << dendl; @@ -4454,6 +4475,7 @@ void BlueStore::_osr_reap_done(OpSequencer *osr) { std::lock_guard l(osr->qlock); dout(20) << __func__ << " osr " << osr << dendl; + CollectionRef c; while (!osr->q.empty()) { TransContext *txc = &osr->q.front(); dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name() @@ -4462,6 +4484,10 @@ void BlueStore::_osr_reap_done(OpSequencer *osr) break; } + if (!c && txc->first_collection) { + c = txc->first_collection; + } + osr->q.pop_front(); txc->log_state_latency(logger, l_bluestore_state_done_lat); delete txc; @@ -4469,9 +4495,11 @@ void BlueStore::_osr_reap_done(OpSequencer *osr) if (osr->q.empty()) dout(20) << __func__ << " osr " << osr << " q now empty" << dendl; } - - cache.trim(g_conf->bluestore_onode_cache_size, - g_conf->bluestore_buffer_cache_size); + if (c) { + c->cache->trim( + g_conf->bluestore_onode_cache_size, + g_conf->bluestore_buffer_cache_size); + } } void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t) @@ -4861,6 +4889,10 @@ void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t) for (vector::iterator p = i.colls.begin(); p != i.colls.end(); ++p, ++j) { cvec[j] = _get_collection(*p); + + // note first collection we reference + if (!j && !txc->first_collection) + txc->first_collection = cvec[j]; } vector ovec(i.objects.size()); @@ -6551,7 +6583,11 @@ int BlueStore::_create_collection( r = -EEXIST; goto out; } - c->reset(new Collection(this, cid)); + c->reset( + new Collection( + this, + cache_shards[cid.hash_to_shard(cache_shards.size())], + cid)); (*c)->cnode.bits = bits; coll_map[cid] = *c; } diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index a11561b4fe0..98da0d83653 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -452,8 +452,11 @@ public: bool get_next(const ghobject_t& after, pair *next); }; + struct Cache; + struct Collection : public CollectionImpl { BlueStore *store; + Cache *cache; ///< our cache shard coll_t cid; bluestore_cnode_t cnode; RWLock lock; @@ -465,7 +468,6 @@ public: // cache onodes on a per-collection basis to avoid lock // contention. OnodeSpace onode_map; - Cache cache; OnodeRef get_onode(const ghobject_t& oid, bool create); BnodeRef get_bnode(uint32_t hash); @@ -491,7 +493,7 @@ public: return false; } - Collection(BlueStore *ns, coll_t c); + Collection(BlueStore *ns, Cache *ca, coll_t c); }; typedef boost::intrusive_ptr CollectionRef; @@ -584,6 +586,8 @@ public: IOContext ioc; + CollectionRef first_collection; ///< first referenced collection + uint64_t seq = 0; utime_t start; @@ -814,7 +818,7 @@ private: RWLock coll_lock; ///< rwlock to protect coll_map ceph::unordered_map coll_map; - Cache cache; + vector cache_shards; std::mutex nid_lock; uint64_t nid_last; @@ -985,6 +989,8 @@ public: int fsck() override; + void set_cache_shards(unsigned num) override; + int validate_hobject_key(const hobject_t &obj) const override { return 0; } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index fa33c9f0efb..28c56c911bb 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -674,6 +674,12 @@ public: return o; } + unsigned hash_to_shard(unsigned num_shards) const { + if (type == TYPE_PG) + return pgid.hash_to_shard(num_shards); + return 0; // whatever. + } + void dump(Formatter *f) const; static void generate_test_instances(list& o); };