}
}
+/// Return true if @p key has the length of an enode key.
+///
+/// Classification is by size alone: 2 + 8 + 4 bytes, matching the three
+/// fields get_enode_key() appends (shard, pool, hash). The per-field
+/// widths come from the _key_encode_* helpers, which are not visible
+/// here -- NOTE(review): confirm the shard encoding is 2 bytes.
+static bool is_enode_key(const string& key)
+{
+  return key.size() == 2 + 8 + 4;
+}
+
+/// Build the key for the enode identified by (shard, pool, hash).
+///
+/// @param shard  shard id, encoded first
+/// @param pool   pool id, encoded second after adding 2^63
+/// @param hash   object hash, encoded last with its bits reversed
+/// @param key    output; any previous contents are discarded
+static void get_enode_key(shard_id_t shard, int64_t pool, uint32_t hash,
+                          string *key)
+{
+  // Bias the signed pool id by 2^63 before it is encoded as unsigned.
+  const uint64_t biased_pool = pool + 0x8000000000000000ull;
+  const uint32_t reversed_hash = hobject_t::_reverse_bits(hash);
+
+  key->clear();
+  _key_encode_shard(shard, key);
+  _key_encode_u64(biased_pool, key);
+  _key_encode_u32(reversed_hash, key);
+}
+
static int get_key_object(const string& key, ghobject_t *oid);
static void get_object_key(const ghobject_t& oid, string *key)
_key_encode_u64(seq, out);
}
+// Enode
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.enode(" << this << ") "
+
+/// Drop one reference. Whoever releases the last reference unlinks the
+/// enode from its containing set and frees it.
+void BlueStore::Enode::put()
+{
+  if (nref.dec() != 0)
+    return;
+
+  // Last reference: unlink from the set first, then free ourselves.
+  dout(20) << __func__ << " removing self from set " << enode_set << dendl;
+  enode_set->uset.erase(*this);
+  delete this;
+}
// Onode
: store(ns),
cid(c),
lock("BlueStore::Collection::lock"),
- onode_map()
+ onode_map(),
+ enode_set(g_conf->bluestore_onode_map_size)
{
}
+/// Return the Enode for @p hash in this collection, creating it on a miss.
+///
+/// On a miss a new Enode is allocated and its key is derived from the
+/// collection's pg (a default spg_t is used when cid is not a pg). The
+/// ref_map is decoded from the PREFIX_OBJ record under that key when the
+/// db read succeeds; any negative return is treated as "no record" and
+/// the ref_map is left empty. The new Enode is linked into enode_set
+/// before it is returned.
+BlueStore::EnodeRef BlueStore::Collection::get_enode(
+  uint32_t hash
+  )
+{
+  // Enodes compare and hash by 'hash' alone, so a stack probe is enough
+  // to search the set.
+  Enode probe(hash, string(), NULL);
+  auto found = enode_set.uset.find(probe);
+  if (found != enode_set.uset.end()) {
+    dout(10) << __func__ << " hash " << std::hex << hash << std::dec
+             << " had " << &*found << dendl;
+    return &*found;
+  }
+
+  // Miss: work out where this enode lives in the kv store.
+  spg_t pgid;
+  if (!cid.is_pg(&pgid))
+    pgid = spg_t();  // meta
+  string key;
+  get_enode_key(pgid.shard, pgid.pool(), hash, &key);
+
+  EnodeRef e = new Enode(hash, key, &enode_set);
+  dout(10) << __func__ << " hash " << std::hex << hash << std::dec
+           << " created " << e << dendl;
+
+  bufferlist v;
+  int r = store->db->get(PREFIX_OBJ, key, &v);
+  if (r < 0) {
+    dout(10) << __func__ << " hash " << std::hex << hash << std::dec
+             << " missed, new ref_map" << dendl;
+  } else {
+    assert(v.length() > 0);
+    bufferlist::iterator bp = v.begin();
+    ::decode(e->ref_map, bp);
+    dout(10) << __func__ << " hash " << std::hex << hash << std::dec
+             << " loaded ref_map " << e->ref_map << dendl;
+  }
+
+  enode_set.uset.insert(*e);
+  return e;
+}
+
BlueStore::OnodeRef BlueStore::Collection::get_onode(
const ghobject_t& oid,
bool create)
return 0;
}
+/// fsck helper: rebuild the expected ref_map from the shared extents
+/// collected in @p v and compare it with the enode's own ref_map.
+///
+/// For each extent, the part not covered by earlier extents is added to
+/// the expected map with a count of 1; the part that overlaps earlier
+/// extents goes through ref_map.get() (presumably taking one more
+/// reference -- confirm against bluestore_extent_ref_map_t).
+///
+/// @param enode  enode whose ref_map is being checked
+/// @param v      shared extents gathered for this enode's hash
+/// @return 1 if the maps differ (the mismatch is logged), otherwise 0
+int BlueStore::_verify_enode_shared(
+  EnodeRef enode,
+  vector<bluestore_extent_t>& v)
+{
+  interval_set<uint64_t> span;          // every range seen so far
+  bluestore_extent_ref_map_t ref_map;   // expected map, built below
+  dout(10) << __func__ << " hash " << enode->hash << " v " << v << dendl;
+
+  for (auto& p : v) {
+    // Split the extent into the part seen before (overlap) and the rest.
+    interval_set<uint64_t> fresh, overlap;
+    fresh.insert(p.offset, p.length);
+    overlap.intersection_of(fresh, span);
+    fresh.subtract(overlap);
+    dout(20) << __func__ << " extent " << p << " t " << fresh
+             << " i " << overlap << dendl;
+
+    for (auto q = fresh.begin(); q != fresh.end(); ++q) {
+      ref_map.add(q.get_start(), q.get_len(), 1);
+    }
+    for (auto q = overlap.begin(); q != overlap.end(); ++q) {
+      ref_map.get(q.get_start(), q.get_len());
+    }
+    span.insert(fresh);
+  }
+
+  if (enode->ref_map != ref_map) {
+    derr << " hash " << enode->hash << " ref_map " << enode->ref_map
+         << " != expected " << ref_map << dendl;
+    return 1;
+  }
+  return 0;
+}
+
int BlueStore::fsck()
{
dout(1) << __func__ << dendl;
set<uint64_t> used_omap_head;
interval_set<uint64_t> used_blocks;
KeyValueDB::Iterator it;
+ EnodeRef enode;
+ vector<bluestore_extent_t> hash_shared;
int r = _open_path();
if (r < 0)
++errors;
break;
}
+        // Switch enodes whenever this object's hash differs from the current
+        // enode's, and also for the first object, when no enode is loaded
+        // yet. The !enode case is required: enode starts out null and (in the
+        // code visible here) is only assigned inside this branch, so testing
+        // "enode && ..." would never enter it and no shared extents would
+        // ever be verified.
+        // NOTE(review): the group for the last hash is only verified if
+        // something after the loop does so -- confirm.
+        if (!enode || enode->hash != o->oid.hobj.get_hash()) {
+          if (enode)
+            errors += _verify_enode_shared(enode, hash_shared);
+          enode = c->get_enode(o->oid.hobj.get_hash());
+          hash_shared.clear();
+        }
if (o->onode.nid) {
if (used_nids.count(o->onode.nid)) {
derr << " " << oid << " nid " << o->onode.nid << " already in use"
}
// blocks
for (auto& b : o->onode.block_map) {
+ if (b.second.has_flag(bluestore_extent_t::FLAG_SHARED))
+ hash_shared.push_back(b.second);
if (used_blocks.intersects(b.second.offset, b.second.length)) {
derr << " " << oid << " extent " << b.first << ": " << b.second
<< " already allocated" << dendl;
}
break;
}
+ if (is_enode_key(it->key())) {
+ dout(20) << __func__ << " key "
+ << pretty_binary_string(it->key())
+ << " (enode, skipping)" << dendl;
+ it->next();
+ continue;
+ }
dout(20) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
ghobject_t oid;
int r = get_key_object(it->key(), &oid);
++p) {
bufferlist bl;
::encode((*p)->onode, bl);
- dout(20) << " onode size is " << bl.length() << dendl;
+ dout(20) << " onode " << (*p)->oid << " is " << bl.length() << dendl;
txc->t->set(PREFIX_OBJ, (*p)->key, bl);
Mutex::Locker l((*p)->flush_lock);
(*p)->flush_txns.insert(txc);
}
+ // finalize enodes
+ for (set<EnodeRef>::iterator p = txc->enodes.begin();
+ p != txc->enodes.end();
+ ++p) {
+ if ((*p)->ref_map.empty()) {
+ dout(20) << " enode " << std::hex << (*p)->hash << std::dec
+ << " ref_map is empty" << dendl;
+ txc->t->rmkey(PREFIX_OBJ, (*p)->key);
+ } else {
+ bufferlist bl;
+ ::encode((*p)->ref_map, bl);
+ dout(20) << " enode " << std::hex << (*p)->hash << std::dec
+ << " ref_map is " << bl.length() << dendl;
+ txc->t->set(PREFIX_OBJ, (*p)->key, bl);
+ }
+ }
+
// journal wal items
if (txc->wal_txn) {
txc->wal_txn->seq = wal_seq.inc();
#include <unistd.h>
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/unordered_set.hpp>
+#include <boost/functional/hash.hpp>
+
#include "include/assert.h"
#include "include/unordered_map.h"
#include "include/memory.h"
#include "bluestore_types.h"
#include "BlockDevice.h"
-#include "boost/intrusive/list.hpp"
-
class Allocator;
class FreelistManager;
class BlueFS;
class TransContext;
- /// an extent map, shared by a group of objects (clones)
- struct ObjectGroup {
- atomic_t nref; ///< reference count
- bluestore_extent_ref_map_t m;
+ /// an in-memory extent-map, shared by a group of objects (w/ same hash value)
+ struct EnodeSet;
+
+  /// In-memory, reference-counted state for one object hash value. It
+  /// carries the extent ref_map that is stored under 'key' and is linked
+  /// into an EnodeSet through its intrusive base hook.
+  struct Enode : public boost::intrusive::unordered_set_base_hook<> {
+    atomic_t nref;        ///< reference count
+    uint32_t hash;        ///< hash value; the sole identity for ==/hash_value
+    string key;           ///< key under PREFIX_OBJ where we are stored
+    EnodeSet *enode_set;  ///< reference to the containing set
+
+    bluestore_extent_ref_map_t ref_map;
+
+    // NOTE(review): EnodeSet::uset is declared with default options, so it
+    // links through the base hook above; this member hook looks unused in
+    // the code visible here.
+    boost::intrusive::unordered_set_member_hook<> map_item;
+
+    /// Starts with a reference count of zero.
+    Enode(uint32_t h, const string& k, EnodeSet *s)
+      : nref(0), hash(h), key(k), enode_set(s) {}
+
+    void get() { nref.inc(); }
+    void put();
+
+    // boost::intrusive_ptr hooks
+    friend void intrusive_ptr_add_ref(Enode *e) { e->get(); }
+    friend void intrusive_ptr_release(Enode *e) { e->put(); }
+
+    // Equality and hashing for the intrusive unordered_set: by hash only.
+    friend bool operator==(const Enode &l, const Enode &r) {
+      return l.hash == r.hash;
+    }
+    friend std::size_t hash_value(const Enode &e) {
+      return e.hash;
+    }
+  };
+ typedef boost::intrusive_ptr<Enode> EnodeRef;
+
+ /// hash of Enodes, by (object) hash value
+  struct EnodeSet {
+    typedef boost::intrusive::unordered_set<Enode> set_type;
+    typedef set_type::bucket_type bucket_type;
+    typedef set_type::bucket_traits bucket_traits;
+
+    unsigned num_buckets;
+    // Bucket storage handed to uset. It is declared before uset so that it
+    // is constructed first; uset's bucket_traits point into it.
+    vector<bucket_type> buckets;
+
+    set_type uset;
+
+    /// @param n number of hash buckets; must be non-zero
+    EnodeSet(unsigned n)
+      : num_buckets(n),
+        buckets(n),
+        uset(bucket_traits(buckets.data(), num_buckets)) {
+      assert(n > 0);
+    }
+
+    /// Every Enode must already have been unlinked from the set.
+    ~EnodeSet() {
+      assert(uset.empty());
+    }
};
/// an in-memory object
string key; ///< key under PREFIX_OBJ where we are stored
boost::intrusive::list_member_hook<> lru_item;
+ EnodeRef enode; ///< ref to Enode [optional]
+
bluestore_onode_t onode; ///< metadata stored as value in kv store
bool dirty; // ???
bool exists;
// contention.
OnodeHashLRU onode_map;
+ EnodeSet enode_set; ///< open Enodes
+
OnodeRef get_onode(const ghobject_t& oid, bool create);
+ EnodeRef get_enode(uint32_t hash);
bool contains(const ghobject_t& oid) {
if (cid.is_meta())
uint64_t ops, bytes;
set<OnodeRef> onodes; ///< these onodes need to be updated/written
+ set<EnodeRef> enodes; ///< these enodes need to be updated/written
KeyValueDB::Transaction t; ///< then we will commit this
Context *oncommit; ///< signal on commit
Context *onreadable; ///< signal on readable
void write_onode(OnodeRef &o) {
onodes.insert(o);
}
+    /// Add @p e to the set of enodes this transaction needs to
+    /// update/write; adding the same enode again has no further effect.
+    void write_enode(EnodeRef &e) { enodes.insert(e); }
};
class OpSequencer : public Sequencer_impl {
int _do_wal_op(bluestore_wal_op_t& wo, IOContext *ioc);
int _wal_replay();
+ // for fsck
+ int _verify_enode_shared(EnodeRef enode, vector<bluestore_extent_t>& v);
+
public:
BlueStore(CephContext *cct, const string& path);
~BlueStore();