_key_encode_u64(seq, out);
}
-// BufferCache
-#undef dout_prefix
-#define dout_prefix *_dout << "bluestore.BufferCache(" << this << ") "
+// Buffer
ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
{
return out << ")";
}
-void BlueStore::BufferCache::trim(uint64_t keep)
+
+// Cache
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.Cache(" << this << ") "
+
+void BlueStore::Cache::_touch_onode(OnodeRef& o)
+{
+  // Relink the onode at the head of onode_lru: unlink it from its
+  // current position, then push it back on the front.
+  onode_lru.erase(onode_lru.iterator_to(*o));
+  onode_lru.push_front(*o);
+}
+
+void BlueStore::Cache::trim(uint64_t onode_max, uint64_t buffer_max) // trim buffers while buffer_size > buffer_max, then onodes down toward onode_max
{
- audit_lru();
- auto i = lru.end();
- if (size) {
- assert(i != lru.begin());
+ std::lock_guard<std::mutex> l(lock); // held until trim() returns
+
+ dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
+ << " buffers " << buffer_size << " / " << buffer_max
+ << dendl;
+
+ _audit_lru();
+
+ // buffers: walk backwards from the end of buffer_lru, removing clean buffers
+ auto i = buffer_lru.end();
+ if (buffer_size) {
+ assert(i != buffer_lru.begin()); // a non-zero buffer_size implies a non-empty list
--i;
}
- while (size > keep) {
+ while (buffer_size > buffer_max) {
Buffer *b = &*i;
if (b->is_clean()) {
auto p = b->space->buffer_map.find(b->offset);
- if (i != lru.begin()) {
+ if (i != buffer_lru.begin()) { // step off b before _rm_buffer() unlinks it
--i;
}
dout(20) << __func__ << " rm " << *b << dendl;
b->space->_rm_buffer(p);
} else {
- if (i != lru.begin()) {
+ if (i != buffer_lru.begin()) { // not clean: leave it cached and look at the previous entry
--i;
continue;
} else {
}
}
}
+
+ // onodes: unlink from the end of onode_lru until num entries are gone
+ int num = onode_lru.size() - onode_max; // NOTE(review): unsigned difference narrowed to int; assumes it fits — confirm
+ if (num <= 0)
+ return; // don't even try
+
+ auto p = onode_lru.end();
+ if (num) // always true here, since num <= 0 returned above
+ --p;
+ while (num > 0) {
+ Onode *o = &*p;
+ int refs = o->nref.load();
+ if (refs > 1) { // more than one reference: stop trimming onodes altogether
+ dout(20) << __func__ << " " << o->oid << " has " << refs
+ << " refs; stopping with " << num << " left to trim" << dendl;
+ break;
+ }
+ dout(30) << __func__ << " trim " << o->oid << dendl;
+ if (p != onode_lru.begin()) {
+ onode_lru.erase(p--); // post-decrement keeps p valid across the erase
+ } else {
+ onode_lru.erase(p);
+ assert(num == 1); // erasing the first entry must also be the last trim
+ }
+ o->get(); // paranoia: hold a reference across the onode_map erase below
+ o->space->onode_map.erase(o->oid);
+ o->bc._clear(); // clear buffers, too
+ o->put();
+ --num;
+ }
}
#ifdef DEBUG_CACHE
-void BlueStore::BufferCache::audit_lru()
+void BlueStore::Cache::_audit_lru() // DEBUG_CACHE builds only: check buffer_size against the summed lengths in buffer_lru
{
if (true) {
uint64_t s = 0;
- for (auto i = lru.begin(); i != lru.end(); ++i) {
+ for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) { // recompute the total from the list itself
s += i->length;
}
- if (s != size) {
- derr << __func__ << " size " << size << " actual " << s << dendl;
- for (auto i = lru.begin(); i != lru.end(); ++i) {
+ if (s != buffer_size) {
+ derr << __func__ << " buffer_size " << buffer_size << " actual " << s
+ << dendl;
+ for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) { // dump every buffer before asserting
derr << __func__ << " " << *i << dendl;
}
- assert(s == size);
+ assert(s == buffer_size); // the condition is false on this path (we are inside s != buffer_size)
}
- dout(20) << __func__ << " size " << size << " ok" << dendl;
+ dout(20) << __func__ << " buffer_size " << buffer_size << " ok" << dendl;
}
}
#endif
// BufferSpace
#undef dout_prefix
-#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << ") "
+#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
+
+void BlueStore::BufferSpace::_clear()
+{
+  // Remove every buffer in this space.
+  // note: we already hold cache->lock
+  dout(10) << __func__ << dendl;
+  for (auto p = buffer_map.begin();
+       p != buffer_map.end();
+       p = buffer_map.begin()) {
+    // _rm_buffer() erases p from buffer_map, so restart from begin()
+    _rm_buffer(p);
+  }
+}
-void BlueStore::BufferSpace::discard(uint64_t offset, uint64_t length)
+void BlueStore::BufferSpace::_discard(uint64_t offset, uint64_t length)
{
- cache->audit_lru();
+ // Discard cached buffer data starting at offset, up to offset + length.
+ //
+ // note: the caller must already hold cache->lock. discard(), write() and
+ // did_read() all take it before calling us, and cache->lock is a plain
+ // std::mutex, so locking it again here would deadlock the calling thread.
+ cache->_audit_lru();
auto i = _data_lower_bound(offset);
uint64_t end = offset + length;
while (i != buffer_map.end()) {
} else {
_add_buffer(new Buffer(this, b->state, b->seq, end, tail));
}
- cache->size -= b->length - front;
+ cache->buffer_size -= b->length - front;
b->truncate(front);
- cache->audit_lru();
+ cache->_audit_lru();
return;
} else {
// drop tail
- cache->size -= b->length - front;
+ cache->buffer_size -= b->length - front;
b->truncate(front);
++i;
continue;
_add_buffer(new Buffer(this, b->state, b->seq, end, keep));
_rm_buffer(i);
}
- cache->audit_lru();
+ cache->_audit_lru();
return;
}
}
BlueStore::ready_regions_t& res,
interval_set<uint64_t>& res_intervals)
{
+ std::lock_guard<std::mutex> l(cache->lock);
res.clear();
uint64_t end = offset + length;
for (auto i = _data_lower_bound(offset);
res_intervals.insert(offset, l);
offset += l;
length -= l;
+ cache->_touch_buffer(b);
continue;
}
if (b->offset > offset) {
offset += b->length;
length -= b->length;
}
+ cache->_touch_buffer(b);
}
}
}
void BlueStore::BufferSpace::finish_write(uint64_t seq)
{
+ std::lock_guard<std::mutex> l(cache->lock);
auto i = writing.begin();
while (i != writing.end()) {
Buffer *b = &*i;
++i;
}
}
- cache->audit_lru();
-}
-
-
-// Bnode
-
-#undef dout_prefix
-#define dout_prefix *_dout << "bluestore.bnode(" << this << ") "
-
-void BlueStore::Bnode::put()
-{
- if (--nref == 0) {
- dout(20) << __func__ << " removing self from set " << bnode_set << dendl;
- bnode_set->uset.erase(*this);
- delete this;
- }
-}
-
-// Onode
-
-#undef dout_prefix
-#define dout_prefix *_dout << "bluestore.onode(" << this << ") "
-
-void BlueStore::Onode::flush()
-{
- std::unique_lock<std::mutex> l(flush_lock);
- dout(20) << __func__ << " " << flush_txns << dendl;
- while (!flush_txns.empty())
- flush_cond.wait(l);
- dout(20) << __func__ << " done" << dendl;
+ cache->_audit_lru();
}
-// OnodeHashLRU
+// OnodeSpace
#undef dout_prefix
-#define dout_prefix *_dout << "bluestore.lru(" << this << ") "
-
-void BlueStore::OnodeHashLRU::_touch(OnodeRef o)
-{
- auto p = lru.iterator_to(*o);
- lru.erase(p);
- lru.push_front(*o);
-}
+#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
-void BlueStore::OnodeHashLRU::add(const ghobject_t& oid, OnodeRef o)
+void BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o) // register o under oid (which must not be present yet) and link it into the cache's onode LRU
{
- std::lock_guard<std::mutex> l(lock);
+ std::lock_guard<std::mutex> l(cache->lock); // the cache's lock guards both onode_map and the LRU here
dout(30) << __func__ << " " << oid << " " << o << dendl;
assert(onode_map.count(oid) == 0);
onode_map[oid] = o;
- lru.push_front(*o);
- _trim(max_size);
+ cache->onode_lru.push_front(*o); // new entries go on the front of the LRU
}
-BlueStore::OnodeRef BlueStore::OnodeHashLRU::lookup(const ghobject_t& oid)
+BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid) // return the onode for oid, or an empty OnodeRef on a miss
{
- std::lock_guard<std::mutex> l(lock);
+ std::lock_guard<std::mutex> l(cache->lock); // needed for the map lookup and for the LRU touch below
dout(30) << __func__ << dendl;
ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
if (p == onode_map.end()) {
return OnodeRef();
}
dout(30) << __func__ << " " << oid << " hit " << p->second << dendl;
- _touch(p->second);
+ cache->_touch_onode(p->second); // a hit moves the onode to the front of the LRU
return p->second;
}
-void BlueStore::OnodeHashLRU::clear()
+void BlueStore::OnodeSpace::clear() // unlink every onode of this space from the cache LRU, drop its buffers, then empty onode_map
{
- std::lock_guard<std::mutex> l(lock);
+ std::lock_guard<std::mutex> l(cache->lock); // one lock acquisition covers the whole sweep
dout(10) << __func__ << dendl;
- lru.clear();
+ for (auto &p : onode_map) { // only this space's onodes are unlinked; the LRU list itself is not cleared
+ auto q = cache->onode_lru.iterator_to(*p.second);
+ cache->onode_lru.erase(q);
+
+ // clear buffers too, while we have cache->lock
+ p.second->bc._clear();
+ }
onode_map.clear();
}
-void BlueStore::OnodeHashLRU::rename(OnodeRef& oldo,
+void BlueStore::OnodeSpace::rename(OnodeRef& oldo,
const ghobject_t& old_oid,
const ghobject_t& new_oid)
{
- std::lock_guard<std::mutex> l(lock);
+ std::lock_guard<std::mutex> l(cache->lock);
dout(30) << __func__ << " " << old_oid << " -> " << new_oid << dendl;
ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
po = onode_map.find(old_oid);
assert(po != onode_map.end());
if (pn != onode_map.end()) {
dout(30) << __func__ << " removing target " << pn->second << dendl;
- auto p = lru.iterator_to(*pn->second);
- lru.erase(p);
+ auto p = cache->onode_lru.iterator_to(*pn->second);
+ cache->onode_lru.erase(p);
onode_map.erase(pn);
}
OnodeRef o = po->second;
// install a non-existent onode at old location
- oldo.reset(new Onode(old_oid, o->key, o->bc.cache));
+ oldo.reset(new Onode(this, old_oid, o->key, o->bc.cache));
po->second = oldo;
- lru.push_back(*po->second);
+ cache->onode_lru.push_back(*po->second);
// add at new position and fix oid, key
onode_map.insert(make_pair(new_oid, o));
- _touch(o);
+ cache->_touch_onode(o);
o->oid = new_oid;
get_object_key(new_oid, &o->key);
}
-bool BlueStore::OnodeHashLRU::get_next(
+bool BlueStore::OnodeSpace::get_next(
const ghobject_t& after,
pair<ghobject_t,OnodeRef> *next)
{
- std::lock_guard<std::mutex> l(lock);
+ std::lock_guard<std::mutex> l(cache->lock);
dout(20) << __func__ << " after " << after << dendl;
if (after == ghobject_t()) {
- if (lru.empty()) {
+ if (cache->onode_lru.empty()) {
return false;
}
ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.begin();
ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(after);
assert(p != onode_map.end()); // for now
- auto pi = lru.iterator_to(*p->second);
+ auto pi = cache->onode_lru.iterator_to(*p->second);
++pi;
- if (pi == lru.end()) {
+ if (pi == cache->onode_lru.end()) {
return false;
}
next->first = pi->oid;
return true;
}
-int BlueStore::OnodeHashLRU::trim(int max)
+
+// Bnode
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.bnode(" << this << ") "
+
+void BlueStore::Bnode::put() // drop one reference; dropping the last one unlinks this Bnode from its set and deletes it
{
- std::lock_guard<std::mutex> l(lock);
- if (max < 0) {
- max = max_size;
+ if (--nref == 0) { // nref is a std::atomic_int, so the decrement and test are one atomic step
+ dout(20) << __func__ << " removing self from set " << bnode_set << dendl;
+ bnode_set->uset.erase(*this); // unlink from the owning set before destruction
+ delete this; // no member access is allowed past this point
}
- return _trim(max);
}
-int BlueStore::OnodeHashLRU::_trim(int max)
+// Onode
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.onode(" << this << ") "
+
+void BlueStore::Onode::flush() // block the caller until flush_txns is empty
{
- dout(20) << __func__ << " max " << max << " size " << onode_map.size() << dendl;
- int trimmed = 0;
- int num = onode_map.size() - max;
- if (onode_map.size() == 0 || num <= 0)
- return 0; // don't even try
-
- auto p = lru.end();
- if (num)
- --p;
- while (num > 0) {
- Onode *o = &*p;
- int refs = o->nref.load();
- if (refs > 1) {
- dout(20) << __func__ << " " << o->oid << " has " << refs
- << " refs; stopping with " << num << " left to trim" << dendl;
- break;
- }
- dout(30) << __func__ << " trim " << o->oid << dendl;
- if (p != lru.begin()) {
- lru.erase(p--);
- } else {
- lru.erase(p);
- assert(num == 1);
- }
- o->get(); // paranoia
- onode_map.erase(o->oid);
- o->put();
- --num;
- ++trimmed;
- }
- return trimmed;
+ std::unique_lock<std::mutex> l(flush_lock); // unique_lock because it is handed to flush_cond.wait() below
+ dout(20) << __func__ << " " << flush_txns << dendl;
+ while (!flush_txns.empty()) // condition is re-checked after every wakeup
+ flush_cond.wait(l);
+ dout(20) << __func__ << " done" << dendl;
}
+
+
// =======================================================
// Collection
cid(c),
lock("BlueStore::Collection::lock", true, false),
exists(true),
- bnode_set(g_conf->bluestore_onode_map_size),
- onode_map(g_conf->bluestore_onode_map_size)
+ bnode_set(MAX(16, g_conf->bluestore_onode_cache_size / 128)),
+ onode_map(&ns->cache)
{
}
return OnodeRef();
// new
- on = new Onode(oid, key, &buffer_cache);
+ on = new Onode(&onode_map, oid, key, &cache);
} else {
// loaded
assert(r >=0);
- on = new Onode(oid, key, &buffer_cache);
+ on = new Onode(&onode_map, oid, key, &cache);
on->exists = true;
bufferlist::iterator p = v.begin();
::decode(on->onode, p);
break;
}
- if (txc->first_collection) {
- RWLock::WLocker l(txc->first_collection->lock);
- txc->first_collection->onode_map.trim();
- txc->first_collection->buffer_cache.trim(
- g_conf->bluestore_collection_buffer_cache_size);
- }
-
osr->q.pop_front();
txc->log_state_latency(logger, l_bluestore_state_done_lat);
delete txc;
if (osr->q.empty())
dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
}
+
+ cache.trim(g_conf->bluestore_onode_cache_size,
+ g_conf->bluestore_buffer_cache_size);
}
void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
++p, ++j) {
cvec[j] = _get_collection(*p);
-
- // note first collection we reference
- if (!j && !txc->first_collection)
- txc->first_collection = cvec[j];
}
vector<OnodeRef> ovec(i.objects.size());
typedef map<const bluestore_pextent_t*, regions2read_t> extents2read_t;
typedef map<uint64_t, bufferlist> ready_regions_t;
- /// cached buffer
struct BufferSpace;
+
+ /// cached buffer
struct Buffer {
enum {
STATE_UNDEF = 0,
}
};
- /// manage a collection of buffers (per-collection, currently)
- struct BufferCache {
- typedef boost::intrusive::list<
- Buffer,
- boost::intrusive::member_hook<
- Buffer,
- boost::intrusive::list_member_hook<>,
- &Buffer::lru_item> > buffer_lru_list_t;
-
- buffer_lru_list_t lru;
- uint64_t size = 0;
-
- void trim(uint64_t keep);
-
-#ifdef DEBUG_CACHE
- void audit_lru();
-#else
- void audit_lru() { /* no-op */ }
-#endif
- };
+ struct Cache;
/// map logical extent range (object) onto buffers
struct BufferSpace {
&Buffer::state_item> > state_list_t;
map<uint64_t,std::unique_ptr<Buffer>> buffer_map;
- BufferCache *cache;
+ Cache *cache;
state_list_t writing;
- BufferSpace(BufferCache *c) : cache(c) {}
+ BufferSpace(Cache *c) : cache(c) {}
void _add_buffer(Buffer *b) {
+ cache->_audit_lru();
buffer_map[b->offset].reset(b);
- cache->lru.push_front(*b);
- cache->size += b->length;
+ cache->buffer_lru.push_front(*b);
+ cache->buffer_size += b->length;
if (b->is_writing()) {
writing.push_back(*b);
}
- cache->audit_lru();
+ cache->_audit_lru();
}
void _rm_buffer(Buffer *b) {
_rm_buffer(buffer_map.find(b->offset));
}
void _rm_buffer(map<uint64_t,std::unique_ptr<Buffer>>::iterator p) {
- cache->size -= p->second->length;
- cache->lru.erase(cache->lru.iterator_to(*p->second));
+ cache->_audit_lru();
+ cache->buffer_size -= p->second->length;
+ cache->buffer_lru.erase(cache->buffer_lru.iterator_to(*p->second));
if (p->second->is_writing()) {
writing.erase(writing.iterator_to(*p->second));
}
buffer_map.erase(p);
- cache->audit_lru();
- }
-
- /// move to top of lru
- void _touch_buffer(Buffer *b) {
- auto p = cache->lru.iterator_to(*b);
- cache->lru.erase(p);
- cache->lru.push_front(*b);
+ cache->_audit_lru();
}
map<uint64_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
return buffer_map.empty();
}
- void discard(uint64_t offset, uint64_t length);
+ void _clear();
+
+ void discard(uint64_t offset, uint64_t length) {
+ std::lock_guard<std::mutex> l(cache->lock);
+ _discard(offset, length);
+ }
+ void _discard(uint64_t offset, uint64_t length);
void write(uint64_t seq, uint64_t offset, bufferlist& bl, unsigned flags) {
- discard(offset, bl.length());
+ std::lock_guard<std::mutex> l(cache->lock);
+ _discard(offset, bl.length());
_add_buffer(new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
flags));
}
void finish_write(uint64_t seq);
void did_read(uint64_t offset, bufferlist& bl) {
- discard(offset, bl.length());
+ std::lock_guard<std::mutex> l(cache->lock);
+ _discard(offset, bl.length());
_add_buffer(new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl));
}
}
void dump(Formatter *f) const {
+ std::lock_guard<std::mutex> l(cache->lock);
f->open_array_section("buffers");
for (auto& i : buffer_map) {
f->open_object_section("buffer");
}
};
- /// an in-memory extent-map, shared by a group of objects (w/ same hash value)
struct BnodeSet;
+ /// an in-memory extent-map, shared by a group of objects (w/ same hash value)
struct Bnode : public boost::intrusive::unordered_set_base_hook<> {
std::atomic_int nref; ///< reference count
uint32_t hash;
}
};
+ struct OnodeSpace;
+
/// an in-memory object
struct Onode {
std::atomic_int nref; ///< reference count
ghobject_t oid;
string key; ///< key under PREFIX_OBJ where we are stored
+
+ OnodeSpace *space; ///< containing OnodeSpace
boost::intrusive::list_member_hook<> lru_item;
BnodeRef bnode; ///< ref to Bnode [optional]
BufferSpace bc;
- Onode(const ghobject_t& o, const string& k, BufferCache *c)
+ Onode(OnodeSpace *s, const ghobject_t& o, const string& k, Cache *c)
: nref(0),
oid(o),
key(k),
+ space(s),
exists(false),
bc(c) {
}
};
typedef boost::intrusive_ptr<Onode> OnodeRef;
- struct OnodeHashLRU {
+ /// a cache (shard) of onodes and buffers
+ struct Cache {
+ typedef boost::intrusive::list<
+ Buffer,
+ boost::intrusive::member_hook<
+ Buffer,
+ boost::intrusive::list_member_hook<>,
+ &Buffer::lru_item> > buffer_lru_list_t;
typedef boost::intrusive::list<
Onode,
boost::intrusive::member_hook<
boost::intrusive::list_member_hook<>,
&Onode::lru_item> > onode_lru_list_t;
- std::mutex lock;
+ std::mutex lock; ///< protect lru and other structures
+ buffer_lru_list_t buffer_lru;
+ uint64_t buffer_size = 0;
+ onode_lru_list_t onode_lru;
+
+ void _touch_onode(OnodeRef& o);
+
+ void _touch_buffer(Buffer *b) {
+   // Relink b at the head of buffer_lru, then re-check the LRU
+   // accounting (a no-op unless DEBUG_CACHE is defined).
+   buffer_lru.erase(buffer_lru.iterator_to(*b));
+   buffer_lru.push_front(*b);
+   _audit_lru();
+ }
+
+ void trim(uint64_t onode_max, uint64_t buffer_max);
+
+#ifdef DEBUG_CACHE
+ void _audit_lru();
+#else
+ void _audit_lru() { /* no-op */ }
+#endif
+ };
+
+ struct OnodeSpace { // a set of onodes keyed by oid; LRU linkage and locking live in the associated Cache
+ Cache *cache; ///< cache whose lock and onode_lru this space uses
ceph::unordered_map<ghobject_t,OnodeRef> onode_map; ///< forward lookups
- onode_lru_list_t lru; ///< lru
- size_t max_size;
- OnodeHashLRU(size_t s) : max_size(s) {}
+ OnodeSpace(Cache *c) : cache(c) {} // the pointer is stored, not owned
+ ~OnodeSpace() { // clear() dereferences cache, so the Cache must still be alive here
+ clear();
+ }
void add(const ghobject_t& oid, OnodeRef o);
- void _touch(OnodeRef o);
OnodeRef lookup(const ghobject_t& o);
void rename(OnodeRef& o, const ghobject_t& old_oid, const ghobject_t& new_oid);
void clear();
bool get_next(const ghobject_t& after, pair<ghobject_t,OnodeRef> *next);
- int trim(int max=-1);
- int _trim(int max);
};
struct Collection : public CollectionImpl {
// cache onodes on a per-collection basis to avoid lock
// contention.
- OnodeHashLRU onode_map;
- BufferCache buffer_cache;
+ OnodeSpace onode_map;
+ Cache cache;
OnodeRef get_onode(const ghobject_t& oid, bool create);
BnodeRef get_bnode(uint32_t hash);
IOContext ioc;
- CollectionRef first_collection; ///< first referenced collection
-
uint64_t seq = 0;
utime_t start;
RWLock coll_lock; ///< rwlock to protect coll_map
ceph::unordered_map<coll_t, CollectionRef> coll_map;
+ Cache cache;
+
std::mutex nid_lock;
uint64_t nid_last;
uint64_t nid_max;