From: Sage Weil Date: Mon, 14 Dec 2015 21:57:10 +0000 (-0500) Subject: os/bluestore: Enode infrastructure X-Git-Tag: v10.0.3~154^2~72 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=64b4e2f6129b698bb13071406312d6d6f19d1d79;p=ceph.git os/bluestore: Enode infrastructure Enodes will track extent ref counts for any extent that is marked shared. There will be an enode for any unique hash value that has any refs. We will keep in-memory copies of only those Enodes that are referenced by in-memory Onodes, and only if the enode is requested (e.g., the enode won't be loaded as a result of an object read because we never need to call get_enode.). Signed-off-by: Sage Weil --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index c832716936f4..0302b7d04f97 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -273,6 +273,22 @@ static void get_coll_key_range(const coll_t& cid, int bits, } } +static bool is_enode_key(const string& key) +{ + if (key.size() == 2 + 8 + 4) + return true; + return false; +} + +static void get_enode_key(shard_id_t shard, int64_t pool, uint32_t hash, + string *key) +{ + key->clear(); + _key_encode_shard(shard, key); + _key_encode_u64(pool + 0x8000000000000000ull, key); + _key_encode_u32(hobject_t::_reverse_bits(hash), key); +} + static int get_key_object(const string& key, ghobject_t *oid); static void get_object_key(const ghobject_t& oid, string *key) @@ -433,6 +449,20 @@ static void get_wal_key(uint64_t seq, string *out) _key_encode_u64(seq, out); } +// Enode + +#undef dout_prefix +#define dout_prefix *_dout << "bluestore.enode(" << this << ") " + +void BlueStore::Enode::put() +{ + int final = nref.dec(); + if (final == 0) { + dout(20) << __func__ << " removing self from set " << enode_set << dendl; + enode_set->uset.erase(*this); + delete this; + } +} // Onode @@ -606,10 +636,48 @@ BlueStore::Collection::Collection(BlueStore *ns, coll_t c) : store(ns), cid(c), lock("BlueStore::Collection::lock"), - onode_map() + onode_map(), + enode_set(g_conf->bluestore_onode_map_size) { } +BlueStore::EnodeRef BlueStore::Collection::get_enode( + uint32_t hash + ) +{ + Enode dummy(hash, string(), NULL); + auto p = enode_set.uset.find(dummy); + if (p == enode_set.uset.end()) { + spg_t pgid; + if (!cid.is_pg(&pgid)) + pgid = spg_t(); // meta + string key; + get_enode_key(pgid.shard, pgid.pool(), hash, &key); + EnodeRef e = new Enode(hash, key, &enode_set); + dout(10) << __func__ << " hash " << std::hex << hash << std::dec + << " created " << e << dendl; + + bufferlist v; + int r = store->db->get(PREFIX_OBJ, key, &v); + if (r >= 0) { + assert(v.length() > 0); + bufferlist::iterator p = v.begin(); + ::decode(e->ref_map, p); + dout(10) << __func__ << " hash " << std::hex << hash << std::dec + << " loaded ref_map " << e->ref_map << dendl; + } else { + dout(10) << __func__ << " hash " <& v) +{ + int errors = 0; + interval_set span; + bluestore_extent_ref_map_t ref_map; + dout(10) << __func__ << " hash " << enode->hash << " v " << v << dendl; + for (auto& p : v) { + interval_set t, i; + t.insert(p.offset, p.length); + i.intersection_of(t, span); + t.subtract(i); + dout(20) << __func__ << " extent " << p << " t " << t << " i " << i + << dendl; + for (interval_set::iterator q = t.begin(); q != t.end(); ++q) { + ref_map.add(q.get_start(), q.get_len(), 1); + } + for (interval_set::iterator q = i.begin(); q != i.end(); ++q) { + ref_map.get(q.get_start(), q.get_len()); + } + span.insert(t); + } + if (enode->ref_map != ref_map) { + derr << " hash " << enode->hash << " ref_map " << enode->ref_map + << " != expected " << ref_map << dendl; + ++errors; + } + return errors; +} + int BlueStore::fsck() { dout(1) << __func__ << dendl; @@ -1682,6 +1781,8 @@ int BlueStore::fsck() set used_omap_head; interval_set used_blocks; KeyValueDB::Iterator it; + EnodeRef enode; + vector hash_shared; int r = _open_path(); if (r < 0) @@ -1754,6 +1855,12 @@ int BlueStore::fsck() ++errors; break; } + if (enode && enode->hash != o->oid.hobj.get_hash()) { + if (enode) + errors += _verify_enode_shared(enode, hash_shared); + enode = c->get_enode(o->oid.hobj.get_hash()); + hash_shared.clear(); + } if (o->onode.nid) { if (used_nids.count(o->onode.nid)) { derr << " " << oid << " nid " << o->onode.nid << " already in use" @@ -1765,6 +1872,8 @@ int BlueStore::fsck() } // blocks for (auto& b : o->onode.block_map) { + if (b.second.has_flag(bluestore_extent_t::FLAG_SHARED)) + hash_shared.push_back(b.second); if (used_blocks.intersects(b.second.offset, b.second.length)) { derr << " " << oid << " extent " << b.first << ": " << b.second << " already allocated" << dendl; @@ -2595,6 +2704,13 @@ int BlueStore::collection_list( } break; } + if (is_enode_key(it->key())) { + dout(20) << __func__ << " key " + << pretty_binary_string(it->key()) + << " (enode, skipping)" << dendl; + it->next(); + continue; + } dout(20) << __func__ << " key " << pretty_binary_string(it->key()) << dendl; ghobject_t oid; int r = get_key_object(it->key(), &oid); @@ -3129,13 +3245,30 @@ int BlueStore::_txc_finalize(OpSequencer *osr, TransContext *txc) ++p) { bufferlist bl; ::encode((*p)->onode, bl); - dout(20) << " onode size is " << bl.length() << dendl; + dout(20) << " onode " << (*p)->oid << " is " << bl.length() << dendl; txc->t->set(PREFIX_OBJ, (*p)->key, bl); Mutex::Locker l((*p)->flush_lock); (*p)->flush_txns.insert(txc); } + // finalize enodes + for (set::iterator p = txc->enodes.begin(); + p != txc->enodes.end(); + ++p) { + if ((*p)->ref_map.empty()) { + dout(20) << " enode " << std::hex << (*p)->hash << std::dec + << " ref_map is empty" << dendl; + txc->t->rmkey(PREFIX_OBJ, (*p)->key); + } else { + bufferlist bl; + ::encode((*p)->ref_map, bl); + dout(20) << " enode " << std::hex << (*p)->hash << std::dec + << " ref_map is " << bl.length() << dendl; + txc->t->set(PREFIX_OBJ, (*p)->key, bl); + } + } + // journal wal items if (txc->wal_txn) { txc->wal_txn->seq = wal_seq.inc(); diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index e13858e07636..05cc5e970f6a 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -19,6 +19,10 @@ #include +#include +#include +#include + #include "include/assert.h" #include "include/unordered_map.h" #include "include/memory.h" @@ -32,8 +36,6 @@ #include "bluestore_types.h" #include "BlockDevice.h" -#include "boost/intrusive/list.hpp" - class Allocator; class FreelistManager; class BlueFS; @@ -45,10 +47,61 @@ public: class TransContext; - /// an extent map, shared by a group of objects (clones) - struct ObjectGroup { - atomic_t nref; ///< reference count - bluestore_extent_ref_map_t m; + /// an in-memory extent-map, shared by a group of objects (w/ same hash value) + struct EnodeSet; + + struct Enode : public boost::intrusive::unordered_set_base_hook<> { + atomic_t nref; ///< reference count + uint32_t hash; + string key; ///< key under PREFIX_OBJ where we are stored + EnodeSet *enode_set; ///< reference to the containing set + + bluestore_extent_ref_map_t ref_map; + + boost::intrusive::unordered_set_member_hook<> map_item; + + Enode(uint32_t h, const string& k, EnodeSet *s) + : nref(0), + hash(h), + key(k), + enode_set(s) {} + + void get() { + nref.inc(); + } + void put(); + + friend void intrusive_ptr_add_ref(Enode *e) { e->get(); } + friend void intrusive_ptr_release(Enode *e) { e->put(); } + + friend bool operator==(const Enode &l, const Enode &r) { + return l.hash == r.hash; + } + friend std::size_t hash_value(const Enode &e) { + return e.hash; + } + }; + typedef boost::intrusive_ptr EnodeRef; + + /// hash of Enodes, by (object) hash value + struct EnodeSet { + typedef boost::intrusive::unordered_set::bucket_type bucket_type; + typedef boost::intrusive::unordered_set::bucket_traits bucket_traits; + + unsigned num_buckets; + vector buckets; + + boost::intrusive::unordered_set uset; + + EnodeSet(unsigned n) + : num_buckets(n), + buckets(n), + uset(bucket_traits(buckets.data(), num_buckets)) { + assert(n > 0); + } + ~EnodeSet() { + assert(uset.empty()); + } }; /// an in-memory object @@ -59,6 +112,8 @@ public: string key; ///< key under PREFIX_OBJ where we are stored boost::intrusive::list_member_hook<> lru_item; + EnodeRef enode; ///< ref to Enode [optional] + bluestore_onode_t onode; ///< metadata stored as value in kv store bool dirty; // ??? bool exists; @@ -121,7 +176,10 @@ public: // contention. OnodeHashLRU onode_map; + EnodeSet enode_set; ///< open Enodes + OnodeRef get_onode(const ghobject_t& oid, bool create); + EnodeRef get_enode(uint32_t hash); bool contains(const ghobject_t& oid) { if (cid.is_meta()) @@ -204,6 +262,7 @@ public: uint64_t ops, bytes; set onodes; ///< these onodes need to be updated/written + set enodes; ///< these enodes need to be updated/written KeyValueDB::Transaction t; ///< then we will commit this Context *oncommit; ///< signal on commit Context *onreadable; ///< signal on readable @@ -245,6 +304,9 @@ public: void write_onode(OnodeRef &o) { onodes.insert(o); } + void write_enode(EnodeRef &e) { + enodes.insert(e); + } }; class OpSequencer : public Sequencer_impl { @@ -517,6 +579,9 @@ private: int _do_wal_op(bluestore_wal_op_t& wo, IOContext *ioc); int _wal_replay(); + // for fsck + int _verify_enode_shared(EnodeRef enode, vector& v); + public: BlueStore(CephContext *cct, const string& path); ~BlueStore();