return expected_for_release - expected_allocations;
}
-// Cache
+// LruOnodeCacheShard
+struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
+ typedef boost::intrusive::list<
+ BlueStore::Onode,
+ boost::intrusive::member_hook<
+ BlueStore::Onode,
+ boost::intrusive::list_member_hook<>,
+ &BlueStore::Onode::lru_item> > list_t;
+ list_t lru;
+
+ explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}
+
+ void _add(BlueStore::OnodeRef& o, int level) override
+ {
+ (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
+ num = lru.size();
+ }
+ void _rm(BlueStore::OnodeRef& o) override
+ {
+ lru.erase(lru.iterator_to(*o));
+ num = lru.size();
+ }
+ void _touch(BlueStore::OnodeRef& o) override
+ {
+ lru.erase(lru.iterator_to(*o));
+ lru.push_front(*o);
+ num = lru.size();
+ }
+ void _trim_to(uint64_t max) override
+ {
+ if (max >= lru.size()) {
+ return; // don't even try
+ }
+ uint64_t n = lru.size() - max;
-BlueStore::Cache *BlueStore::Cache::create(CephContext* cct, string type,
- PerfCounters *logger)
-{
- Cache *c = nullptr;
+ auto p = lru.end();
+ ceph_assert(p != lru.begin());
+ --p;
+ int skipped = 0;
+ int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned;
+ while (n > 0) {
+ BlueStore::Onode *o = &*p;
+ int refs = o->nref.load();
+ if (refs > 1) {
+ dout(20) << __func__ << " " << o->oid << " has " << refs
+ << " refs, skipping" << dendl;
+ if (++skipped >= max_skipped) {
+ dout(20) << __func__ << " maximum skip pinned reached; stopping with "
+ << n << " left to trim" << dendl;
+ break;
+ }
- if (type == "lru")
- c = new LRUCache(cct);
- else if (type == "2q")
- c = new TwoQCache(cct);
- else
- ceph_abort_msg("unrecognized cache type");
+ if (p == lru.begin()) {
+ break;
+ } else {
+ p--;
+ n--;
+ continue;
+ }
+ }
+ dout(30) << __func__ << " rm " << o->oid << dendl;
+ if (p != lru.begin()) {
+ lru.erase(p--);
+ } else {
+ lru.erase(p);
+ ceph_assert(n == 1);
+ }
+ o->get(); // paranoia
+ o->c->onode_map.remove(o->oid);
+ o->put();
+ --n;
+ }
+ num = lru.size();
+ }
+ void add_stats(uint64_t *onodes) override
+ {
+ *onodes += num;
+ }
+};
+// OnodeCacheShard
+BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
+ CephContext* cct,
+ string type,
+ PerfCounters *logger)
+{
+ BlueStore::OnodeCacheShard *c = nullptr;
+ // Currently we only implement an LRU cache for onodes
+ c = new LruOnodeCacheShard(cct);
c->logger = logger;
return c;
}
-void BlueStore::Cache::trim_onodes()
-{
- std::lock_guard l(lock);
- _trim_onodes();
-}
-
-void BlueStore::Cache::trim_buffers()
-{
- std::lock_guard l(lock);
- _trim_buffers();
-}
-
-void BlueStore::Cache::flush()
-{
- std::lock_guard l(lock);
- _trim_buffers_to(0);
- _trim_onodes_to(0);
-}
+// LruBufferCacheShard
+struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
+ typedef boost::intrusive::list<
+ BlueStore::Buffer,
+ boost::intrusive::member_hook<
+ BlueStore::Buffer,
+ boost::intrusive::list_member_hook<>,
+ &BlueStore::Buffer::lru_item> > list_t;
+ list_t lru;
+
+ explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}
+
+ void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
+ if (near) {
+ auto q = lru.iterator_to(*near);
+ lru.insert(q, *b);
+ } else if (level > 0) {
+ lru.push_front(*b);
+ } else {
+ lru.push_back(*b);
+ }
+ buffer_bytes += b->length;
+ num = lru.size();
+ }
+ void _rm(BlueStore::Buffer *b) override {
+ ceph_assert(buffer_bytes >= b->length);
+ buffer_bytes -= b->length;
+ auto q = lru.iterator_to(*b);
+ lru.erase(q);
+ num = lru.size();
+ }
+ void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
+ src->_rm(b);
+ _add(b, 0, nullptr);
+ }
+ void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
+ ceph_assert((int64_t)buffer_bytes + delta >= 0);
+ buffer_bytes += delta;
+ }
+ void _touch(BlueStore::Buffer *b) override {
+ auto p = lru.iterator_to(*b);
+ lru.erase(p);
+ lru.push_front(*b);
+ num = lru.size();
+ _audit("_touch_buffer end");
+ }
-// LRUCache
-#undef dout_prefix
-#define dout_prefix *_dout << "bluestore.LRUCache(" << this << ") "
-
-void BlueStore::LRUCache::_touch_onode(OnodeRef& o)
-{
- auto p = onode_lru.iterator_to(*o);
- onode_lru.erase(p);
- onode_lru.push_front(*o);
-}
-
-void BlueStore::LRUCache::_trim_onodes_to(uint64_t max) {
- if (max >= onode_lru.size()) {
- return; // don't even try
- }
- uint64_t num = onode_lru.size() - max;
-
- auto p = onode_lru.end();
- ceph_assert(p != onode_lru.begin());
- --p;
- int skipped = 0;
- int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned;
- while (num > 0) {
- Onode *o = &*p;
- int refs = o->nref.load();
- if (refs > 1) {
- dout(20) << __func__ << " " << o->oid << " has " << refs
- << " refs, skipping" << dendl;
- if (++skipped >= max_skipped) {
- dout(20) << __func__ << " maximum skip pinned reached; stopping with "
- << num << " left to trim" << dendl;
+ void _trim_to(uint64_t max) override
+ {
+ while (buffer_bytes > max) {
+ auto i = lru.rbegin();
+ if (i == lru.rend()) {
+ // stop if lru is now empty
break;
}
- if (p == onode_lru.begin()) {
- break;
- } else {
- p--;
- num--;
- continue;
- }
- }
- dout(30) << __func__ << " rm " << o->oid << dendl;
- if (p != onode_lru.begin()) {
- onode_lru.erase(p--);
- } else {
- onode_lru.erase(p);
- ceph_assert(num == 1);
+ BlueStore::Buffer *b = &*i;
+ ceph_assert(b->is_clean());
+ dout(20) << __func__ << " rm " << *b << dendl;
+ b->space->_rm_buffer(this, b);
}
- o->get(); // paranoia
- o->c->onode_map.remove(o->oid);
- o->put();
- --num;
+ num = lru.size();
}
-}
-
-void BlueStore::LRUCache::_trim_buffers_to(uint64_t max) {
- while (buffer_size > max) {
- auto i = buffer_lru.rbegin();
- if (i == buffer_lru.rend()) {
- // stop if buffer_lru is now empty
- break;
- }
- Buffer *b = &*i;
- ceph_assert(b->is_clean());
- dout(20) << __func__ << " rm " << *b << dendl;
- b->space->_rm_buffer(this, b);
+ void add_stats(uint64_t *extents,
+ uint64_t *blobs,
+ uint64_t *buffers,
+ uint64_t *bytes) override {
+ *extents += num_extents;
+ *blobs += num_blobs;
+ *buffers += num;
+ *bytes += buffer_bytes;
}
-}
-
#ifdef DEBUG_CACHE
-void BlueStore::LRUCache::_audit(const char *when)
-{
- dout(10) << __func__ << " " << when << " start" << dendl;
- uint64_t s = 0;
- for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
- s += i->length;
- }
- if (s != buffer_size) {
- derr << __func__ << " buffer_size " << buffer_size << " actual " << s
- << dendl;
- for (auto i = buffer_lru.begin(); i != buffer_lru.end(); ++i) {
- derr << __func__ << " " << *i << dendl;
+ void _audit(const char *s) override
+ {
+ dout(10) << __func__ << " " << when << " start" << dendl;
+ uint64_t s = 0;
+ for (auto i = lru.begin(); i != lru.end(); ++i) {
+ s += i->length;
}
- ceph_assert(s == buffer_size);
+ if (s != buffer_bytes) {
+ derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
+ << dendl;
+ for (auto i = lru.begin(); i != lru.end(); ++i) {
+ derr << __func__ << " " << *i << dendl;
+ }
+ ceph_assert(s == buffer_bytes);
+ }
+ dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
+ << " ok" << dendl;
}
- dout(20) << __func__ << " " << when << " buffer_size " << buffer_size
- << " ok" << dendl;
-}
#endif
+};
-// TwoQCache
-#undef dout_prefix
-#define dout_prefix *_dout << "bluestore.2QCache(" << this << ") "
+// TwoQBufferCacheShard
+
+struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
+ typedef boost::intrusive::list<
+ BlueStore::Buffer,
+ boost::intrusive::member_hook<
+ BlueStore::Buffer,
+ boost::intrusive::list_member_hook<>,
+ &BlueStore::Buffer::lru_item> > list_t;
+ list_t hot; ///< "Am" hot buffers
+ list_t warm_in; ///< "A1in" newly warm buffers
+ list_t warm_out; ///< "A1out" empty buffers we've evicted
+ uint64_t buffer_bytes = 0; ///< bytes
+
+ enum {
+ BUFFER_NEW = 0,
+ BUFFER_WARM_IN, ///< in warm_in
+ BUFFER_WARM_OUT, ///< in warm_out
+ BUFFER_HOT, ///< in hot
+ BUFFER_TYPE_MAX
+ };
+ uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
-void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
-{
- auto p = onode_lru.iterator_to(*o);
- onode_lru.erase(p);
- onode_lru.push_front(*o);
-}
+public:
+ explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}
-void BlueStore::TwoQCache::_add_buffer(Buffer *b, int level, Buffer *near)
-{
- dout(20) << __func__ << " level " << level << " near " << near
- << " on " << *b
- << " which has cache_private " << b->cache_private << dendl;
- if (near) {
- b->cache_private = near->cache_private;
+ void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
+ {
+ dout(20) << __func__ << " level " << level << " near " << near
+ << " on " << *b
+ << " which has cache_private " << b->cache_private << dendl;
+ if (near) {
+ b->cache_private = near->cache_private;
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ warm_in.insert(warm_in.iterator_to(*near), *b);
+ break;
+ case BUFFER_WARM_OUT:
+ ceph_assert(b->is_empty());
+ warm_out.insert(warm_out.iterator_to(*near), *b);
+ break;
+ case BUFFER_HOT:
+ hot.insert(hot.iterator_to(*near), *b);
+ break;
+ default:
+ ceph_abort_msg("bad cache_private");
+ }
+ } else if (b->cache_private == BUFFER_NEW) {
+ b->cache_private = BUFFER_WARM_IN;
+ if (level > 0) {
+ warm_in.push_front(*b);
+ } else {
+ // take caller hint to start at the back of the warm queue
+ warm_in.push_back(*b);
+ }
+ } else {
+ // we got a hint from discard
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ // stay in warm_in. move to front, even though 2Q doesn't actually
+ // do this.
+ dout(20) << __func__ << " move to front of warm " << *b << dendl;
+ warm_in.push_front(*b);
+ break;
+ case BUFFER_WARM_OUT:
+ b->cache_private = BUFFER_HOT;
+ // move to hot. fall-thru
+ case BUFFER_HOT:
+ dout(20) << __func__ << " move to front of hot " << *b << dendl;
+ hot.push_front(*b);
+ break;
+ default:
+ ceph_abort_msg("bad cache_private");
+ }
+ }
+ if (!b->is_empty()) {
+ buffer_bytes += b->length;
+ list_bytes[b->cache_private] += b->length;
+ }
+ num = hot.size() + warm_in.size();
+ }
+
+ void _rm(BlueStore::Buffer *b) override
+ {
+ dout(20) << __func__ << " " << *b << dendl;
+ if (!b->is_empty()) {
+ ceph_assert(buffer_bytes >= b->length);
+ buffer_bytes -= b->length;
+ ceph_assert(list_bytes[b->cache_private] >= b->length);
+ list_bytes[b->cache_private] -= b->length;
+ }
switch (b->cache_private) {
case BUFFER_WARM_IN:
- buffer_warm_in.insert(buffer_warm_in.iterator_to(*near), *b);
+ warm_in.erase(warm_in.iterator_to(*b));
break;
case BUFFER_WARM_OUT:
- ceph_assert(b->is_empty());
- buffer_warm_out.insert(buffer_warm_out.iterator_to(*near), *b);
+ warm_out.erase(warm_out.iterator_to(*b));
break;
case BUFFER_HOT:
- buffer_hot.insert(buffer_hot.iterator_to(*near), *b);
+ hot.erase(hot.iterator_to(*b));
break;
default:
ceph_abort_msg("bad cache_private");
}
- } else if (b->cache_private == BUFFER_NEW) {
- b->cache_private = BUFFER_WARM_IN;
- if (level > 0) {
- buffer_warm_in.push_front(*b);
- } else {
- // take caller hint to start at the back of the warm queue
- buffer_warm_in.push_back(*b);
- }
- } else {
- // we got a hint from discard
+ num = hot.size() + warm_in.size();
+ }
+
+ void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
+ {
+ TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
+ src->_rm(b);
+
+ // preserve which list we're on (even if we can't preserve the order!)
switch (b->cache_private) {
case BUFFER_WARM_IN:
- // stay in warm_in. move to front, even though 2Q doesn't actually
- // do this.
- dout(20) << __func__ << " move to front of warm " << *b << dendl;
- buffer_warm_in.push_front(*b);
+ ceph_assert(!b->is_empty());
+ warm_in.push_back(*b);
break;
case BUFFER_WARM_OUT:
- b->cache_private = BUFFER_HOT;
- // move to hot. fall-thru
+ ceph_assert(b->is_empty());
+ warm_out.push_back(*b);
+ break;
case BUFFER_HOT:
- dout(20) << __func__ << " move to front of hot " << *b << dendl;
- buffer_hot.push_front(*b);
+ ceph_assert(!b->is_empty());
+ hot.push_back(*b);
break;
default:
ceph_abort_msg("bad cache_private");
}
+ if (!b->is_empty()) {
+ buffer_bytes += b->length;
+ list_bytes[b->cache_private] += b->length;
+ }
+ num = hot.size() + warm_in.size();
}
- if (!b->is_empty()) {
- buffer_bytes += b->length;
- buffer_list_bytes[b->cache_private] += b->length;
- }
-}
-void BlueStore::TwoQCache::_rm_buffer(Buffer *b)
-{
- dout(20) << __func__ << " " << *b << dendl;
- if (!b->is_empty()) {
- ceph_assert(buffer_bytes >= b->length);
- buffer_bytes -= b->length;
- ceph_assert(buffer_list_bytes[b->cache_private] >= b->length);
- buffer_list_bytes[b->cache_private] -= b->length;
- }
- switch (b->cache_private) {
- case BUFFER_WARM_IN:
- buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
- break;
- case BUFFER_WARM_OUT:
- buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));
- break;
- case BUFFER_HOT:
- buffer_hot.erase(buffer_hot.iterator_to(*b));
- break;
- default:
- ceph_abort_msg("bad cache_private");
+ void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
+ {
+ dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
+ if (!b->is_empty()) {
+ ceph_assert((int64_t)buffer_bytes + delta >= 0);
+ buffer_bytes += delta;
+ ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
+ list_bytes[b->cache_private] += delta;
+ }
}
-}
-void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b)
-{
- TwoQCache *src = static_cast<TwoQCache*>(srcc);
- src->_rm_buffer(b);
-
- // preserve which list we're on (even if we can't preserve the order!)
- switch (b->cache_private) {
- case BUFFER_WARM_IN:
- ceph_assert(!b->is_empty());
- buffer_warm_in.push_back(*b);
- break;
- case BUFFER_WARM_OUT:
- ceph_assert(b->is_empty());
- buffer_warm_out.push_back(*b);
- break;
- case BUFFER_HOT:
- ceph_assert(!b->is_empty());
- buffer_hot.push_back(*b);
- break;
- default:
- ceph_abort_msg("bad cache_private");
- }
- if (!b->is_empty()) {
- buffer_bytes += b->length;
- buffer_list_bytes[b->cache_private] += b->length;
+ void _touch(BlueStore::Buffer *b) override {
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ // do nothing (somewhat counter-intuitively!)
+ break;
+ case BUFFER_WARM_OUT:
+ // move from warm_out to hot LRU
+ ceph_abort_msg("this happens via discard hint");
+ break;
+ case BUFFER_HOT:
+ // move to front of hot LRU
+ hot.erase(hot.iterator_to(*b));
+ hot.push_front(*b);
+ break;
+ }
+ num = hot.size() + warm_in.size();
+ _audit("_touch_buffer end");
}
-}
-void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta)
-{
- dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
- if (!b->is_empty()) {
- ceph_assert((int64_t)buffer_bytes + delta >= 0);
- buffer_bytes += delta;
- ceph_assert((int64_t)buffer_list_bytes[b->cache_private] + delta >= 0);
- buffer_list_bytes[b->cache_private] += delta;
- }
-}
-
-void BlueStore::TwoQCache::_trim_onodes_to(uint64_t max) {
- if (max >= onode_lru.size()) {
- return; // don't even try
- }
- uint64_t num = onode_lru.size() - max;
-
- auto p = onode_lru.end();
- ceph_assert(p != onode_lru.begin());
- --p;
- int skipped = 0;
- int max_skipped = g_conf()->bluestore_cache_trim_max_skip_pinned;
- while (num > 0) {
- Onode *o = &*p;
- dout(20) << __func__ << " considering " << o << dendl;
- int refs = o->nref.load();
- if (refs > 1) {
- dout(20) << __func__ << " " << o->oid << " has " << refs
- << " refs; skipping" << dendl;
- if (++skipped >= max_skipped) {
- dout(20) << __func__ << " maximum skip pinned reached; stopping with "
- << num << " left to trim" << dendl;
- break;
- }
+ void _trim_to(uint64_t max) override
+ {
+ if (buffer_bytes > max) {
+ uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
+ uint64_t khot = max - kin;
+
+ // pre-calculate kout based on average buffer size too,
+ // which is typical(the warm_in and hot lists may change later)
+ uint64_t kout = 0;
+ uint64_t buffer_num = hot.size() + warm_in.size();
+ if (buffer_num) {
+ uint64_t avg_size = buffer_bytes / buffer_num;
+ ceph_assert(avg_size);
+ uint64_t calculated_num = max / avg_size;
+ kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
+ }
+
+ if (list_bytes[BUFFER_HOT] < khot) {
+ // hot is small, give slack to warm_in
+ kin += khot - list_bytes[BUFFER_HOT];
+ } else if (list_bytes[BUFFER_WARM_IN] < kin) {
+ // warm_in is small, give slack to hot
+ khot += kin - list_bytes[BUFFER_WARM_IN];
+ }
+
+ // adjust warm_in list
+ int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
+ uint64_t evicted = 0;
+
+ while (to_evict_bytes > 0) {
+ auto p = warm_in.rbegin();
+ if (p == warm_in.rend()) {
+ // stop if warm_in list is now empty
+ break;
+ }
- if (p == onode_lru.begin()) {
- break;
- } else {
- p--;
- num--;
- continue;
+ BlueStore::Buffer *b = &*p;
+ ceph_assert(b->is_clean());
+ dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
+ ceph_assert(buffer_bytes >= b->length);
+ buffer_bytes -= b->length;
+ ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
+ list_bytes[BUFFER_WARM_IN] -= b->length;
+ to_evict_bytes -= b->length;
+ evicted += b->length;
+ b->state = BlueStore::Buffer::STATE_EMPTY;
+ b->data.clear();
+ warm_in.erase(warm_in.iterator_to(*b));
+ warm_out.push_front(*b);
+ b->cache_private = BUFFER_WARM_OUT;
+ }
+
+ if (evicted > 0) {
+ dout(20) << __func__ << " evicted " << byte_u_t(evicted)
+ << " from warm_in list, done evicting warm_in buffers"
+ << dendl;
}
- }
- dout(30) << __func__ << " " << o->oid << " num=" << num <<" lru size="<<onode_lru.size()<< dendl;
- if (p != onode_lru.begin()) {
- onode_lru.erase(p--);
- } else {
- onode_lru.erase(p);
- ceph_assert(num == 1);
- }
- o->get(); // paranoia
- o->c->onode_map.remove(o->oid);
- o->put();
- --num;
- }
-}
-
-void BlueStore::TwoQCache::_trim_buffers_to(uint64_t max) {
- if (buffer_bytes > max) {
- uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
- uint64_t khot = max - kin;
- // pre-calculate kout based on average buffer size too,
- // which is typical(the warm_in and hot lists may change later)
- uint64_t kout = 0;
- uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size();
- if (buffer_num) {
- uint64_t buffer_avg_size = buffer_bytes / buffer_num;
- ceph_assert(buffer_avg_size);
- uint64_t calculated_buffer_num = max / buffer_avg_size;
- kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio;
- }
+ // adjust hot list
+ to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
+ evicted = 0;
- if (buffer_list_bytes[BUFFER_HOT] < khot) {
- // hot is small, give slack to warm_in
- kin += khot - buffer_list_bytes[BUFFER_HOT];
- } else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) {
- // warm_in is small, give slack to hot
- khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
- }
+ while (to_evict_bytes > 0) {
+ auto p = hot.rbegin();
+ if (p == hot.rend()) {
+ // stop if hot list is now empty
+ break;
+ }
- // adjust warm_in list
- int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin;
- uint64_t evicted = 0;
+ BlueStore::Buffer *b = &*p;
+ dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
+ ceph_assert(b->is_clean());
+ // adjust evict size before buffer goes invalid
+ to_evict_bytes -= b->length;
+ evicted += b->length;
+ b->space->_rm_buffer(this, b);
+ }
- while (to_evict_bytes > 0) {
- auto p = buffer_warm_in.rbegin();
- if (p == buffer_warm_in.rend()) {
- // stop if warm_in list is now empty
- break;
+ if (evicted > 0) {
+ dout(20) << __func__ << " evicted " << byte_u_t(evicted)
+ << " from hot list, done evicting hot buffers"
+ << dendl;
}
- Buffer *b = &*p;
- ceph_assert(b->is_clean());
- dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
- ceph_assert(buffer_bytes >= b->length);
- buffer_bytes -= b->length;
- ceph_assert(buffer_list_bytes[BUFFER_WARM_IN] >= b->length);
- buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
- to_evict_bytes -= b->length;
- evicted += b->length;
- b->state = Buffer::STATE_EMPTY;
- b->data.clear();
- buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
- buffer_warm_out.push_front(*b);
- b->cache_private = BUFFER_WARM_OUT;
- }
-
- if (evicted > 0) {
- dout(20) << __func__ << " evicted " << byte_u_t(evicted)
- << " from warm_in list, done evicting warm_in buffers"
- << dendl;
+ // adjust warm out list too, if necessary
+ int64_t n = warm_out.size() - kout;
+ while (n-- > 0) {
+ BlueStore::Buffer *b = &*warm_out.rbegin();
+ ceph_assert(b->is_empty());
+ dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
+ b->space->_rm_buffer(this, b);
+ }
}
+ num = hot.size() + warm_in.size();
+ }
- // adjust hot list
- to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
- evicted = 0;
-
- while (to_evict_bytes > 0) {
- auto p = buffer_hot.rbegin();
- if (p == buffer_hot.rend()) {
- // stop if hot list is now empty
- break;
- }
+ void add_stats(uint64_t *extents,
+ uint64_t *blobs,
+ uint64_t *buffers,
+ uint64_t *bytes) override {
+ *extents += num_extents;
+ *blobs += num_blobs;
+ *buffers += num;
+ *bytes += buffer_bytes;
+ }
- Buffer *b = &*p;
- dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
- ceph_assert(b->is_clean());
- // adjust evict size before buffer goes invalid
- to_evict_bytes -= b->length;
- evicted += b->length;
- b->space->_rm_buffer(this, b);
+#ifdef DEBUG_CACHE
+ void _audit(const char *s) override
+ {
+ dout(10) << __func__ << " " << when << " start" << dendl;
+ uint64_t s = 0;
+ for (auto i = hot.begin(); i != hot.end(); ++i) {
+ s += i->length;
}
- if (evicted > 0) {
- dout(20) << __func__ << " evicted " << byte_u_t(evicted)
- << " from hot list, done evicting hot buffers"
- << dendl;
+ uint64_t hot_bytes = s;
+ if (hot_bytes != list_bytes[BUFFER_HOT]) {
+ derr << __func__ << " hot_list_bytes "
+ << list_bytes[BUFFER_HOT]
+ << " != actual " << hot_bytes
+ << dendl;
+ ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
}
- // adjust warm out list too, if necessary
- int64_t num = buffer_warm_out.size() - kout;
- while (num-- > 0) {
- Buffer *b = &*buffer_warm_out.rbegin();
- ceph_assert(b->is_empty());
- dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
- b->space->_rm_buffer(this, b);
+ for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
+ s += i->length;
}
- }
-}
-
-#ifdef DEBUG_CACHE
-void BlueStore::TwoQCache::_audit(const char *when)
-{
- dout(10) << __func__ << " " << when << " start" << dendl;
- uint64_t s = 0;
- for (auto i = buffer_hot.begin(); i != buffer_hot.end(); ++i) {
- s += i->length;
- }
- uint64_t hot_bytes = s;
- if (hot_bytes != buffer_list_bytes[BUFFER_HOT]) {
- derr << __func__ << " hot_list_bytes "
- << buffer_list_bytes[BUFFER_HOT]
- << " != actual " << hot_bytes
- << dendl;
- ceph_assert(hot_bytes == buffer_list_bytes[BUFFER_HOT]);
- }
+ uint64_t warm_in_bytes = s - hot_bytes;
+ if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
+ derr << __func__ << " warm_in_list_bytes "
+ << list_bytes[BUFFER_WARM_IN]
+ << " != actual " << warm_in_bytes
+ << dendl;
+ ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
+ }
- for (auto i = buffer_warm_in.begin(); i != buffer_warm_in.end(); ++i) {
- s += i->length;
- }
+ if (s != buffer_bytes) {
+ derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
+ << dendl;
+ ceph_assert(s == buffer_bytes);
+ }
- uint64_t warm_in_bytes = s - hot_bytes;
- if (warm_in_bytes != buffer_list_bytes[BUFFER_WARM_IN]) {
- derr << __func__ << " warm_in_list_bytes "
- << buffer_list_bytes[BUFFER_WARM_IN]
- << " != actual " << warm_in_bytes
- << dendl;
- ceph_assert(warm_in_bytes == buffer_list_bytes[BUFFER_WARM_IN]);
+ dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
+ << " ok" << dendl;
}
+#endif
+};
- if (s != buffer_bytes) {
- derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
- << dendl;
- ceph_assert(s == buffer_bytes);
- }
+// BuferCacheShard
- dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
- << " ok" << dendl;
+BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
+ CephContext* cct,
+ string type,
+ PerfCounters *logger)
+{
+ BufferCacheShard *c = nullptr;
+ if (type == "lru")
+ c = new LruBufferCacheShard(cct);
+ else if (type == "2q")
+ c = new TwoQBufferCacheShard(cct);
+ else
+ ceph_abort_msg("unrecognized cache type");
+ c->logger = logger;
+ return c;
}
-#endif
-
// BufferSpace
#undef dout_prefix
#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
-void BlueStore::BufferSpace::_clear(Cache* cache)
+void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
{
// note: we already hold cache->lock
ldout(cache->cct, 20) << __func__ << dendl;
}
}
-int BlueStore::BufferSpace::_discard(Cache* cache, uint32_t offset, uint32_t length)
+int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
{
// note: we already hold cache->lock
ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
0, b);
}
if (!b->is_writing()) {
- cache->_adjust_buffer_size(b, front - (int64_t)b->length);
+ cache->_adjust_size(b, front - (int64_t)b->length);
}
b->truncate(front);
b->maybe_rebuild();
} else {
// drop tail
if (!b->is_writing()) {
- cache->_adjust_buffer_size(b, front - (int64_t)b->length);
+ cache->_adjust_size(b, front - (int64_t)b->length);
}
b->truncate(front);
b->maybe_rebuild();
cache->_audit("discard end 2");
break;
}
- cache->_trim_buffers();
return cache_private;
}
void BlueStore::BufferSpace::read(
- Cache* cache,
+ BufferCacheShard* cache,
uint32_t offset,
uint32_t length,
BlueStore::ready_regions_t& res,
offset += l;
length -= l;
if (!b->is_writing()) {
- cache->_touch_buffer(b);
+ cache->_touch(b);
}
continue;
}
length -= gap;
}
if (!b->is_writing()) {
- cache->_touch_buffer(b);
+ cache->_touch(b);
}
if (b->length > length) {
res[offset].substr_of(b->data, 0, length);
cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
}
-void BlueStore::BufferSpace::_finish_write(Cache* cache, uint64_t seq)
+void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
{
auto i = writing.begin();
while (i != writing.end()) {
writing.erase(i++);
b->maybe_rebuild();
b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
- cache->_add_buffer(b, 1, nullptr);
+ cache->_add(b, 1, nullptr);
ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
}
}
- cache->_trim_buffers();
+ cache->_trim();
cache->_audit("finish_write end");
}
-void BlueStore::BufferSpace::split(Cache* cache, size_t pos, BlueStore::BufferSpace &r)
+void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
{
std::lock_guard lk(cache->lock);
if (buffer_map.empty())
r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq, 0, right),
0, p->second.get());
}
- cache->_adjust_buffer_size(p->second.get(), -right);
+ cache->_adjust_size(p->second.get(), -right);
p->second->truncate(left);
break;
}
}
}
ceph_assert(writing.empty());
- cache->_trim_buffers();
+ cache->_trim();
}
// OnodeSpace
}
ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
onode_map[oid] = o;
- cache->_add_onode(o, 1);
- cache->_trim_onodes();
+ cache->_add(o, 1);
+ cache->_trim();
return o;
}
} else {
ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
<< dendl;
- cache->_touch_onode(p->second);
+ cache->_touch(p->second);
hit = true;
o = p->second;
}
std::lock_guard l(cache->lock);
ldout(cache->cct, 10) << __func__ << dendl;
for (auto &p : onode_map) {
- cache->_rm_onode(p.second);
+ cache->_rm(p.second);
}
onode_map.clear();
}
if (pn != onode_map.end()) {
ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
<< dendl;
- cache->_rm_onode(pn->second);
+ cache->_rm(pn->second);
onode_map.erase(pn);
}
OnodeRef o = po->second;
// install a non-existent onode at old location
oldo.reset(new Onode(o->c, old_oid, o->key));
po->second = oldo;
- cache->_add_onode(po->second, 1);
- cache->_trim_onodes();
+ cache->_add(po->second, 1);
// add at new position and fix oid, key
onode_map.insert(make_pair(new_oid, o));
- cache->_touch_onode(o);
+ cache->_touch(o);
o->oid = new_oid;
o->key = new_okey;
+ cache->_trim();
}
bool BlueStore::OnodeSpace::map_any(std::function<bool(OnodeRef)> f)
void BlueStore::SharedBlob::finish_write(uint64_t seq)
{
while (true) {
- Cache *cache = coll->cache;
+ BufferCacheShard *cache = coll->cache;
std::lock_guard l(cache->lock);
if (coll->cache != cache) {
ldout(coll->store->cct, 20) << __func__
bool was_too_many_blobs_check = false;
auto too_many_blobs_threshold =
g_conf()->bluestore_debug_too_many_blobs_threshold;
- auto& dumped_onodes = onode->c->cache->dumped_onodes;
- decltype(onode->c->cache->dumped_onodes)::value_type* oid_slot = nullptr;
- decltype(onode->c->cache->dumped_onodes)::value_type* oldest_slot = nullptr;
+ auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
+ decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
+ decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
if (e->logical_offset >= needs_reshard_end) {
#undef dout_prefix
#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
-BlueStore::Collection::Collection(BlueStore *store_, Cache *c, coll_t cid)
+BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
: CollectionImpl(cid),
store(store_),
- cache(c),
+ cache(bc),
lock("BlueStore::Collection::lock", true, false),
exists(true),
- onode_map(c),
+ onode_map(oc),
commit_queue(nullptr)
{
}
ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
<< dendl;
- cache->_rm_onode(p->second);
+ onode_map.cache->_rm(p->second);
p = onode_map.onode_map.erase(p);
o->c = dest;
- dest->cache->_add_onode(o, 1);
+ dest->onode_map.cache->_add(o, 1);
dest->onode_map.onode_map[o->oid] = o;
- dest->onode_map.cache = dest->cache;
+ dest->onode_map.cache = dest->onode_map.cache;
// move over shared blobs and buffers. cover shared blobs from
// both extent map and spanning blob map (the full extent map
if (!i.second->is_writing()) {
ldout(store->cct, 20) << __func__ << " moving " << *i.second
<< dendl;
- dest->cache->_move_buffer(cache, i.second.get());
+ dest->cache->_move(cache, i.second.get());
}
}
}
}
}
}
- dest->cache->_trim_onodes();
+ dest->cache->_trim();
}
// =======================================================
void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
{
auto cct = store->cct;
- size_t num_shards = store->cache_shards.size();
-
+ size_t onode_shards = store->onode_cache_shards.size();
+ size_t buffer_shards = store->buffer_cache_shards.size();
int64_t kv_used = store->db->get_cache_usage();
int64_t meta_used = meta_cache->_get_used_bytes();
int64_t data_used = data_cache->_get_used_bytes();
}
uint64_t max_shard_onodes = static_cast<uint64_t>(
- (meta_alloc / (double) num_shards) / meta_cache->get_bytes_per_onode());
- uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / num_shards);
+ (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
+ uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);
ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
<< " max_shard_buffer: " << max_shard_buffer << dendl;
- for (auto i : store->cache_shards) {
- i->set_onode_max(max_shard_onodes);
- i->set_buffer_max(max_shard_buffer);
+ for (auto i : store->onode_cache_shards) {
+ i->set_max(max_shard_onodes);
+ }
+ for (auto i : store->buffer_cache_shards) {
+ i->set_max(max_shard_buffer);
}
}
ceph_assert(bluefs == NULL);
ceph_assert(fsid_fd < 0);
ceph_assert(path_fd < 0);
- for (auto i : cache_shards) {
+ for (auto i : onode_cache_shards) {
delete i;
}
- cache_shards.clear();
+ for (auto i : buffer_cache_shards) {
+ delete i;
+ }
+ onode_cache_shards.clear();
+ buffer_cache_shards.clear();
}
const char **BlueStore::get_tracked_conf_keys() const
CollectionRef c(
new Collection(
this,
- cache_shards[cid.hash_to_shard(cache_shards.size())],
+ onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
+ buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
cid));
bufferlist bl = it->value();
auto p = bl.cbegin();
void BlueStore::set_cache_shards(unsigned num)
{
dout(10) << __func__ << " " << num << dendl;
- size_t old = cache_shards.size();
- ceph_assert(num >= old);
- cache_shards.resize(num);
- for (unsigned i = old; i < num; ++i) {
- cache_shards[i] = Cache::create(cct, cct->_conf->bluestore_cache_type,
- logger);
+ size_t oold = onode_cache_shards.size();
+ size_t bold = buffer_cache_shards.size();
+ ceph_assert(num >= oold && num >= bold);
+ onode_cache_shards.resize(num);
+ buffer_cache_shards.resize(num);
+ for (unsigned i = oold; i < num; ++i) {
+ onode_cache_shards[i] =
+ OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
+ logger);
+ }
+ for (unsigned i = bold; i < num; ++i) {
+ buffer_cache_shards[i] =
+ BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
+ logger);
}
}
uint64_t num_blobs = 0;
uint64_t num_buffers = 0;
uint64_t num_buffer_bytes = 0;
- for (auto c : cache_shards) {
- c->add_stats(&num_onodes, &num_extents, &num_blobs,
- &num_buffers, &num_buffer_bytes);
+ for (auto c : onode_cache_shards) {
+ c->add_stats(&num_onodes);
+ }
+ for (auto c : buffer_cache_shards) {
+ c->add_stats(&num_extents, &num_blobs,
+ &num_buffers, &num_buffer_bytes);
}
logger->set(l_bluestore_onodes, num_onodes);
logger->set(l_bluestore_extents, num_extents);
RWLock::WLocker l(coll_lock);
Collection *c = new Collection(
this,
- cache_shards[cid.hash_to_shard(cache_shards.size())],
+ onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
+ buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
cid);
new_coll_map[cid] = c;
_osr_attach(c);
void BlueStore::_flush_cache()
{
dout(10) << __func__ << dendl;
- for (auto i : cache_shards) {
+ for (auto i : onode_cache_shards) {
+ i->flush();
+ ceph_assert(i->empty());
+ }
+ for (auto i : buffer_cache_shards) {
i->flush();
ceph_assert(i->empty());
}
int BlueStore::flush_cache(ostream *os)
{
dout(10) << __func__ << dendl;
- for (auto i : cache_shards) {
+ for (auto i : onode_cache_shards) {
+ i->flush();
+ }
+ for (auto i : buffer_cache_shards) {
i->flush();
}
}
};
- struct Cache;
+ struct BufferCacheShard;
/// map logical extent range (object) onto buffers
struct BufferSpace {
ceph_assert(writing.empty());
}
- void _add_buffer(Cache* cache, Buffer *b, int level, Buffer *near) {
+ void _add_buffer(BufferCacheShard* cache, Buffer *b, int level, Buffer *near) {
cache->_audit("_add_buffer start");
buffer_map[b->offset].reset(b);
if (b->is_writing()) {
}
} else {
b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
- cache->_add_buffer(b, level, near);
+ cache->_add(b, level, near);
}
cache->_audit("_add_buffer end");
}
- void _rm_buffer(Cache* cache, Buffer *b) {
+ void _rm_buffer(BufferCacheShard* cache, Buffer *b) {
_rm_buffer(cache, buffer_map.find(b->offset));
}
- void _rm_buffer(Cache* cache,
+ void _rm_buffer(BufferCacheShard* cache,
map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
ceph_assert(p != buffer_map.end());
cache->_audit("_rm_buffer start");
if (p->second->is_writing()) {
writing.erase(writing.iterator_to(*p->second));
} else {
- cache->_rm_buffer(p->second.get());
+ cache->_rm(p->second.get());
}
buffer_map.erase(p);
cache->_audit("_rm_buffer end");
}
// must be called under protection of the Cache lock
- void _clear(Cache* cache);
+ void _clear(BufferCacheShard* cache);
// return value is the highest cache_private of a trimmed buffer, or 0.
- int discard(Cache* cache, uint32_t offset, uint32_t length) {
+ int discard(BufferCacheShard* cache, uint32_t offset, uint32_t length) {
std::lock_guard l(cache->lock);
- return _discard(cache, offset, length);
+ int ret = _discard(cache, offset, length);
+ cache->_trim();
+ return ret;
}
- int _discard(Cache* cache, uint32_t offset, uint32_t length);
+ int _discard(BufferCacheShard* cache, uint32_t offset, uint32_t length);
- void write(Cache* cache, uint64_t seq, uint32_t offset, bufferlist& bl,
+ void write(BufferCacheShard* cache, uint64_t seq, uint32_t offset, bufferlist& bl,
unsigned flags) {
std::lock_guard l(cache->lock);
Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
flags);
b->cache_private = _discard(cache, offset, bl.length());
_add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
- cache->_trim_buffers();
+ cache->_trim();
}
- void _finish_write(Cache* cache, uint64_t seq);
- void did_read(Cache* cache, uint32_t offset, bufferlist& bl) {
+ void _finish_write(BufferCacheShard* cache, uint64_t seq);
+ void did_read(BufferCacheShard* cache, uint32_t offset, bufferlist& bl) {
std::lock_guard l(cache->lock);
Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
b->cache_private = _discard(cache, offset, bl.length());
_add_buffer(cache, b, 1, nullptr);
- cache->_trim_buffers();
+ cache->_trim();
}
- void read(Cache* cache, uint32_t offset, uint32_t length,
+ void read(BufferCacheShard* cache, uint32_t offset, uint32_t length,
BlueStore::ready_regions_t& res,
interval_set<uint32_t>& res_intervals,
int flags = 0);
- void truncate(Cache* cache, uint32_t offset) {
+ void truncate(BufferCacheShard* cache, uint32_t offset) {
discard(cache, offset, (uint32_t)-1 - offset);
}
- void split(Cache* cache, size_t pos, BufferSpace &r);
+ void split(BufferCacheShard* cache, size_t pos, BufferSpace &r);
- void dump(Cache* cache, Formatter *f) const {
+ void dump(BufferCacheShard* cache, Formatter *f) const {
std::lock_guard l(cache->lock);
f->open_array_section("buffers");
for (auto& i : buffer_map) {
friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
return l.get_sbid() == r.get_sbid();
}
- inline Cache* get_cache() {
+ inline BufferCacheShard* get_cache() {
return coll ? coll->cache : nullptr;
}
inline SharedBlobSet* get_parent() {
};
typedef boost::intrusive_ptr<Onode> OnodeRef;
-
- /// a cache (shard) of onodes and buffers
- struct Cache {
- CephContext* cct;
+ /// A generic Cache Shard
+ struct CacheShard {
+ CephContext *cct;
PerfCounters *logger;
/// protect lru and other structures
ceph::recursive_mutex lock = {
- ceph::make_recursive_mutex("BlueStore::Cache::lock") };
-
- std::atomic<uint64_t> num_extents = {0};
- std::atomic<uint64_t> num_blobs = {0};
- std::atomic<uint64_t> onode_max = {0};
- std::atomic<uint64_t> buffer_max = {0};
-
- std::array<std::pair<ghobject_t, mono_clock::time_point>, 64> dumped_onodes;
-
- static Cache *create(CephContext* cct, string type, PerfCounters *logger);
-
- Cache(CephContext* cct) : cct(cct), logger(nullptr) {}
- virtual ~Cache() {}
-
- virtual void _add_onode(OnodeRef& o, int level) = 0;
- virtual void _rm_onode(OnodeRef& o) = 0;
- virtual void _touch_onode(OnodeRef& o) = 0;
-
- virtual void _add_buffer(Buffer *b, int level, Buffer *near) = 0;
- virtual void _rm_buffer(Buffer *b) = 0;
- virtual void _move_buffer(Cache *src, Buffer *b) = 0;
- virtual void _adjust_buffer_size(Buffer *b, int64_t delta) = 0;
- virtual void _touch_buffer(Buffer *b) = 0;
+ ceph::make_recursive_mutex("BlueStore::CacheShard::lock") };
- virtual uint64_t _get_num_onodes() = 0;
- virtual uint64_t _get_buffer_bytes() = 0;
+ std::atomic<uint64_t> max = {0};
+ std::atomic<uint64_t> num = {0};
- void add_extent() {
- ++num_extents;
- }
- void rm_extent() {
- --num_extents;
- }
-
- void add_blob() {
- ++num_blobs;
- }
- void rm_blob() {
- --num_blobs;
- }
+ CacheShard(CephContext* cct) : cct(cct), logger(nullptr) {}
+ virtual ~CacheShard() {}
- void set_onode_max(uint64_t max) {
- onode_max = max;
+ void set_max(uint64_t max_) {
+ max = max_;
}
- void set_buffer_max(uint64_t max) {
- buffer_max = max;
+ uint64_t _get_num() {
+ return num;
}
- void flush();
- void trim_onodes();
- void trim_buffers();
-
- virtual void _trim_onodes_to(uint64_t max) = 0;
- virtual void _trim_buffers_to(uint64_t max) = 0;
-
- void _trim_onodes() {
- _trim_onodes_to(onode_max);
+ virtual void _trim_to(uint64_t max) = 0;
+ void _trim() {
+ _trim_to(max);
}
-
- void _trim_buffers() {
- _trim_buffers_to(buffer_max);
+ void trim() {
+ std::lock_guard l(lock);
+ _trim();
}
-
- virtual void add_stats(uint64_t *onodes, uint64_t *extents,
- uint64_t *blobs,
- uint64_t *buffers,
- uint64_t *bytes) = 0;
-
- bool empty() {
+ void flush() {
std::lock_guard l(lock);
- return _get_num_onodes() == 0 && _get_buffer_bytes() == 0;
+ _trim_to(0);
}
#ifdef DEBUG_CACHE
#endif
};
- /// simple LRU cache for onodes and buffers
- struct LRUCache : public Cache {
- private:
- typedef boost::intrusive::list<
- Onode,
- boost::intrusive::member_hook<
- Onode,
- boost::intrusive::list_member_hook<>,
- &Onode::lru_item> > onode_lru_list_t;
- typedef boost::intrusive::list<
- Buffer,
- boost::intrusive::member_hook<
- Buffer,
- boost::intrusive::list_member_hook<>,
- &Buffer::lru_item> > buffer_lru_list_t;
-
- onode_lru_list_t onode_lru;
-
- buffer_lru_list_t buffer_lru;
- uint64_t buffer_size = 0;
-
+ /// A Generic onode Cache Shard
+ struct OnodeCacheShard : public CacheShard {
+ std::array<std::pair<ghobject_t, mono_clock::time_point>, 64> dumped_onodes;
public:
- LRUCache(CephContext* cct) : Cache(cct) {}
- uint64_t _get_num_onodes() override {
- return onode_lru.size();
- }
- void _add_onode(OnodeRef& o, int level) override {
- if (level > 0)
- onode_lru.push_front(*o);
- else
- onode_lru.push_back(*o);
- }
- void _rm_onode(OnodeRef& o) override {
- auto q = onode_lru.iterator_to(*o);
- onode_lru.erase(q);
- }
- void _touch_onode(OnodeRef& o) override;
-
- uint64_t _get_buffer_bytes() override {
- return buffer_size;
- }
- void _add_buffer(Buffer *b, int level, Buffer *near) override {
- if (near) {
- auto q = buffer_lru.iterator_to(*near);
- buffer_lru.insert(q, *b);
- } else if (level > 0) {
- buffer_lru.push_front(*b);
- } else {
- buffer_lru.push_back(*b);
- }
- buffer_size += b->length;
- }
- void _rm_buffer(Buffer *b) override {
- ceph_assert(buffer_size >= b->length);
- buffer_size -= b->length;
- auto q = buffer_lru.iterator_to(*b);
- buffer_lru.erase(q);
- }
- void _move_buffer(Cache *src, Buffer *b) override {
- src->_rm_buffer(b);
- _add_buffer(b, 0, nullptr);
- }
- void _adjust_buffer_size(Buffer *b, int64_t delta) override {
- ceph_assert((int64_t)buffer_size + delta >= 0);
- buffer_size += delta;
- }
- void _touch_buffer(Buffer *b) override {
- auto p = buffer_lru.iterator_to(*b);
- buffer_lru.erase(p);
- buffer_lru.push_front(*b);
- _audit("_touch_buffer end");
- }
-
- void _trim_onodes_to(uint64_t max) override;
- void _trim_buffers_to(uint64_t max) override;
+ OnodeCacheShard(CephContext* cct) : CacheShard(cct) {}
+ static OnodeCacheShard *create(CephContext* cct, string type,
+ PerfCounters *logger);
+ virtual void _add(OnodeRef& o, int level) = 0;
+ virtual void _rm(OnodeRef& o) = 0;
+ virtual void _touch(OnodeRef& o) = 0;
+ virtual void add_stats(uint64_t *onodes) = 0;
- void add_stats(uint64_t *onodes, uint64_t *extents,
- uint64_t *blobs,
- uint64_t *buffers,
- uint64_t *bytes) override {
- std::lock_guard l(lock);
- *onodes += onode_lru.size();
- *extents += num_extents;
- *blobs += num_blobs;
- *buffers += buffer_lru.size();
- *bytes += buffer_size;
+ bool empty() {
+ return _get_num() == 0;
}
-
-#ifdef DEBUG_CACHE
- void _audit(const char *s) override;
-#endif
};
- // 2Q cache for buffers, LRU for onodes
- struct TwoQCache : public Cache {
- private:
- // stick with LRU for onodes for now (fixme?)
- typedef boost::intrusive::list<
- Onode,
- boost::intrusive::member_hook<
- Onode,
- boost::intrusive::list_member_hook<>,
- &Onode::lru_item> > onode_lru_list_t;
- typedef boost::intrusive::list<
- Buffer,
- boost::intrusive::member_hook<
- Buffer,
- boost::intrusive::list_member_hook<>,
- &Buffer::lru_item> > buffer_list_t;
-
- onode_lru_list_t onode_lru;
-
- buffer_list_t buffer_hot; ///< "Am" hot buffers
- buffer_list_t buffer_warm_in; ///< "A1in" newly warm buffers
- buffer_list_t buffer_warm_out; ///< "A1out" empty buffers we've evicted
- uint64_t buffer_bytes = 0; ///< bytes
-
- enum {
- BUFFER_NEW = 0,
- BUFFER_WARM_IN, ///< in buffer_warm_in
- BUFFER_WARM_OUT, ///< in buffer_warm_out
- BUFFER_HOT, ///< in buffer_hot
- BUFFER_TYPE_MAX
- };
-
- uint64_t buffer_list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
+ /// A Generic buffer Cache Shard
+ struct BufferCacheShard : public CacheShard {
+ std::atomic<uint64_t> num_extents = {0};
+ std::atomic<uint64_t> num_blobs = {0};
+ uint64_t buffer_bytes = 0;
public:
- TwoQCache(CephContext* cct) : Cache(cct) {}
- uint64_t _get_num_onodes() override {
- return onode_lru.size();
+ BufferCacheShard(CephContext* cct) : CacheShard(cct) {}
+ static BufferCacheShard *create(CephContext* cct, string type,
+ PerfCounters *logger);
+ virtual void _add(Buffer *b, int level, Buffer *near) = 0;
+ virtual void _rm(Buffer *b) = 0;
+ virtual void _move(BufferCacheShard *src, Buffer *b) = 0;
+ virtual void _touch(Buffer *b) = 0;
+ virtual void _adjust_size(Buffer *b, int64_t delta) = 0;
+
+ uint64_t _get_bytes() {
+ return buffer_bytes;
}
- void _add_onode(OnodeRef& o, int level) override {
- if (level > 0)
- onode_lru.push_front(*o);
- else
- onode_lru.push_back(*o);
+
+ void add_extent() {
+ ++num_extents;
}
- void _rm_onode(OnodeRef& o) override {
- auto q = onode_lru.iterator_to(*o);
- onode_lru.erase(q);
+ void rm_extent() {
+ --num_extents;
}
- void _touch_onode(OnodeRef& o) override;
- uint64_t _get_buffer_bytes() override {
- return buffer_bytes;
+ void add_blob() {
+ ++num_blobs;
}
- void _add_buffer(Buffer *b, int level, Buffer *near) override;
- void _rm_buffer(Buffer *b) override;
- void _move_buffer(Cache *src, Buffer *b) override;
- void _adjust_buffer_size(Buffer *b, int64_t delta) override;
- void _touch_buffer(Buffer *b) override {
- switch (b->cache_private) {
- case BUFFER_WARM_IN:
- // do nothing (somewhat counter-intuitively!)
- break;
- case BUFFER_WARM_OUT:
- // move from warm_out to hot LRU
- ceph_abort_msg("this happens via discard hint");
- break;
- case BUFFER_HOT:
- // move to front of hot LRU
- buffer_hot.erase(buffer_hot.iterator_to(*b));
- buffer_hot.push_front(*b);
- break;
- }
- _audit("_touch_buffer end");
+ void rm_blob() {
+ --num_blobs;
}
- void _trim_onodes_to(uint64_t max) override;
- void _trim_buffers_to(uint64_t max) override;
+ virtual void add_stats(uint64_t *extents,
+ uint64_t *blobs,
+ uint64_t *buffers,
+ uint64_t *bytes) = 0;
- void add_stats(uint64_t *onodes, uint64_t *extents,
- uint64_t *blobs,
- uint64_t *buffers,
- uint64_t *bytes) override {
+ bool empty() {
std::lock_guard l(lock);
- *onodes += onode_lru.size();
- *extents += num_extents;
- *blobs += num_blobs;
- *buffers += buffer_hot.size() + buffer_warm_in.size();
- *bytes += buffer_bytes;
+ return _get_bytes() == 0;
}
-
-#ifdef DEBUG_CACHE
- void _audit(const char *s) override;
-#endif
};
struct OnodeSpace {
- private:
- Cache *cache;
+ OnodeCacheShard *cache;
+ private:
/// forward lookups
mempool::bluestore_cache_other::unordered_map<ghobject_t,OnodeRef> onode_map;
friend class Collection; // for split_cache()
public:
- OnodeSpace(Cache *c) : cache(c) {}
+ OnodeSpace(OnodeCacheShard *c) : cache(c) {}
~OnodeSpace() {
clear();
}
struct Collection : public CollectionImpl {
BlueStore *store;
OpSequencerRef osr;
- Cache *cache; ///< our cache shard
+ BufferCacheShard *cache; ///< our cache shard
bluestore_cnode_t cnode;
RWLock lock;
void flush() override;
void flush_all_but_last();
- Collection(BlueStore *ns, Cache *ca, coll_t c);
+ Collection(BlueStore *ns, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t c);
};
class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
mempool::bluestore_cache_other::unordered_map<coll_t, CollectionRef> coll_map;
map<coll_t,CollectionRef> new_coll_map;
- vector<Cache*> cache_shards;
+ vector<OnodeCacheShard*> onode_cache_shards;
+ vector<BufferCacheShard*> buffer_cache_shards;
/// protect zombie_osr_set
ceph::mutex zombie_osr_lock = ceph::make_mutex("BlueStore::zombie_osr_lock");
virtual uint64_t _get_used_bytes() const {
uint64_t bytes = 0;
- for (auto i : store->cache_shards) {
- bytes += i->_get_buffer_bytes();
+ for (auto i : store->buffer_cache_shards) {
+ bytes += i->_get_bytes();
}
return bytes;
}
void set_cache_shards(unsigned num) override;
void dump_cache_stats(Formatter *f) override {
int onode_count = 0, buffers_bytes = 0;
- for (auto i: cache_shards) {
- onode_count += i->_get_num_onodes();
- buffers_bytes += i->_get_buffer_bytes();
+ for (auto i: onode_cache_shards) {
+ onode_count += i->_get_num();
+ }
+ for (auto i: buffer_cache_shards) {
+ buffers_bytes += i->_get_bytes();
}
f->dump_int("bluestore_onode", onode_count);
f->dump_int("bluestore_buffers", buffers_bytes);
}
void dump_cache_stats(ostream& ss) override {
int onode_count = 0, buffers_bytes = 0;
- for (auto i: cache_shards) {
- onode_count += i->_get_num_onodes();
- buffers_bytes += i->_get_buffer_bytes();
+ for (auto i: onode_cache_shards) {
+ onode_count += i->_get_num();
+ }
+ for (auto i: buffer_cache_shards) {
+ buffers_bytes += i->_get_bytes();
}
ss << "bluestore_onode: " << onode_count;
ss << "bluestore_buffers: " << buffers_bytes;
{
{
BlueStore store(g_ceph_context, "", 4096);
- BlueStore::Cache *cache = BlueStore::Cache::create(
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
g_ceph_context, "lru", NULL);
- BlueStore::Collection coll(&store, cache, coll_t());
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+
+ BlueStore::Collection coll(&store, oc, bc, coll_t());
BlueStore::Blob b;
b.shared_blob = new BlueStore::SharedBlob(nullptr);
b.shared_blob->get(); // hack to avoid dtor from running
unsigned mas = 4096;
BlueStore store(g_ceph_context, "", 8192);
- BlueStore::Cache *cache = BlueStore::Cache::create(
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
g_ceph_context, "lru", NULL);
- BlueStore::CollectionRef coll(new BlueStore::Collection(&store, cache, coll_t()));
+ BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t()));
{
BlueStore::Blob B;
}
{
BlueStore store(g_ceph_context, "", 0x4000);
- BlueStore::Cache *cache = BlueStore::Cache::create(
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
g_ceph_context, "lru", NULL);
- BlueStore::CollectionRef coll(new BlueStore::Collection(&store, cache, coll_t()));
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+
+ BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t()));
BlueStore::Blob B;
B.shared_blob = new BlueStore::SharedBlob(nullptr);
B.shared_blob->get(); // hack to avoid dtor from running
TEST(Blob, split)
{
BlueStore store(g_ceph_context, "", 4096);
- BlueStore::Cache *cache = BlueStore::Cache::create(
- g_ceph_context, "lru", NULL);
- BlueStore::CollectionRef coll(new BlueStore::Collection(&store, cache, coll_t()));
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t()));
{
BlueStore::Blob L, R;
L.shared_blob = new BlueStore::SharedBlob(coll.get());
TEST(Blob, legacy_decode)
{
BlueStore store(g_ceph_context, "", 4096);
- BlueStore::Cache *cache = BlueStore::Cache::create(
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
g_ceph_context, "lru", NULL);
- BlueStore::CollectionRef coll(new BlueStore::Collection(&store, cache, coll_t()));
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t()));
bufferlist bl, bl2;
{
BlueStore::Blob B;
TEST(ExtentMap, seek_lextent)
{
BlueStore store(g_ceph_context, "", 4096);
- BlueStore::LRUCache cache(g_ceph_context);
- BlueStore::CollectionRef coll(new BlueStore::Collection(&store, &cache, coll_t()));
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+
+ BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t()));
BlueStore::Onode onode(coll.get(), ghobject_t(), "");
BlueStore::ExtentMap em(&onode);
BlueStore::BlobRef br(new BlueStore::Blob);
TEST(ExtentMap, has_any_lextents)
{
BlueStore store(g_ceph_context, "", 4096);
- BlueStore::LRUCache cache(g_ceph_context);
- BlueStore::CollectionRef coll(new BlueStore::Collection(&store, &cache, coll_t()));
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t()));
BlueStore::Onode onode(coll.get(), ghobject_t(), "");
BlueStore::ExtentMap em(&onode);
BlueStore::BlobRef b(new BlueStore::Blob);
TEST(ExtentMap, compress_extent_map)
{
BlueStore store(g_ceph_context, "", 4096);
- BlueStore::LRUCache cache(g_ceph_context);
- BlueStore::CollectionRef coll(new BlueStore::Collection(&store, &cache, coll_t()));
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+
+ BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t()));
BlueStore::Onode onode(coll.get(), ghobject_t(), "");
BlueStore::ExtentMap em(&onode);
BlueStore::BlobRef b1(new BlueStore::Blob);
TEST(GarbageCollector, BasicTest)
{
- BlueStore::LRUCache cache(g_ceph_context);
+ BlueStore::OnodeCacheShard *oc = BlueStore::OnodeCacheShard::create(
+ g_ceph_context, "lru", NULL);
+ BlueStore::BufferCacheShard *bc = BlueStore::BufferCacheShard::create(
+ g_ceph_context, "lru", NULL);
+
BlueStore store(g_ceph_context, "", 4096);
- BlueStore::CollectionRef coll(new BlueStore::Collection(&store, &cache, coll_t()));
+ BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t()));
BlueStore::Onode onode(coll.get(), ghobject_t(), "");
BlueStore::ExtentMap em(&onode);
*/
{
BlueStore store(g_ceph_context, "", 0x10000);
- BlueStore::CollectionRef coll(new BlueStore::Collection(&store, &cache, coll_t()));
+ BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t()));
BlueStore::Onode onode(coll.get(), ghobject_t(), "");
BlueStore::ExtentMap em(&onode);
*/
{
BlueStore store(g_ceph_context, "", 0x10000);
- BlueStore::CollectionRef coll(new BlueStore::Collection(&store, &cache, coll_t()));
+ BlueStore::CollectionRef coll(new BlueStore::Collection(&store, oc, bc, coll_t()));
BlueStore::Onode onode(coll.get(), ghobject_t(), "");
BlueStore::ExtentMap em(&onode);