Cache trimming can stall when a bunch of pinned entries sits at the top of the onode cache's LRU list. If these entries stay pinned for a long time, the cache might start using too much memory, causing the OSD to exceed the osd_memory_target limit. The pinned state tends to happen to osdmap onodes.
The proposed patch preserves the last trim position in the LRU list (if it pointed to a pinned entry) and resumes trimming from that position if it hasn't been invalidated. The LRU nature of the list makes this safe, since no new entries can appear above an existing entry while it remains untouched.
Fixes: https://tracker.ceph.com/issues/48729
Signed-off-by: Igor Fedotov <ifedotov@suse.com>
OPTION(bluestore_debug_misc, OPT_BOOL)
OPTION(bluestore_debug_no_reuse_blocks, OPT_BOOL)
OPTION(bluestore_debug_small_allocations, OPT_INT)
+OPTION(bluestore_debug_max_cached_onodes, OPT_INT)
OPTION(bluestore_debug_too_many_blobs_threshold, OPT_INT)
OPTION(bluestore_debug_freelist, OPT_BOOL)
OPTION(bluestore_debug_prefill, OPT_FLOAT)
Option("bluestore_debug_small_allocations", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(0)
.set_description(""),
-
+ Option("bluestore_debug_max_cached_onodes", Option::TYPE_INT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description("This allows one to explicitly cap the number of onode entries per cache shard, "
+ "effectively bypassing all the smart but indirect cache adjustment logic. "
+ "Intended for testing purposes only."),
Option("bluestore_debug_too_many_blobs_threshold", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(24*1024)
.set_description(""),
}
// onodes
- if (onode_max >= onode_lru.size()) {
+ if (onode_max >= onode_lru.size() ||
+ last_pinned == onode_lru.begin()) {
return; // don't even try
}
uint64_t num = onode_lru.size() - onode_max;
- auto p = onode_lru.end();
+ auto p = last_pinned;
+ last_pinned = onode_lru.end();
ceph_assert(p != onode_lru.begin());
--p;
int skipped = 0;
dout(20) << __func__ << " " << o->oid << " has " << refs
<< " refs, skipping" << dendl;
if (++skipped >= max_skipped) {
- dout(20) << __func__ << " maximum skip pinned reached; stopping with "
+ dout(15) << __func__ << " maximum skip pinned reached; stopping with "
<< num << " left to trim" << dendl;
+ last_pinned = p;
break;
}
}
dout(30) << __func__ << " rm " << o->oid << dendl;
if (p != onode_lru.begin()) {
- onode_lru.erase(p--);
+ _onode_lru_erase(p--);
} else {
- onode_lru.erase(p);
- ceph_assert(num == 1);
+ _onode_lru_erase(p);
+ num = 1; // fake num to end the loop
+ // in fact we might still have some pinned onodes
}
o->get(); // paranoia
o->c->onode_map.remove(o->oid);
void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
{
auto p = onode_lru.iterator_to(*o);
- onode_lru.erase(p);
+ // go through the erase wrapper so a remembered 'last_pinned' trim
+ // position is reset if it happens to reference this entry before it is
+ // moved back to the LRU front
+ _onode_lru_erase(p);
onode_lru.push_front(*o);
}
}
// onodes
- if (onode_max >= onode_lru.size()) {
+ if (onode_max >= onode_lru.size() ||
+ last_pinned == onode_lru.begin()) {
return; // don't even try
}
uint64_t num = onode_lru.size() - onode_max;
- auto p = onode_lru.end();
+ auto p = last_pinned;
+ last_pinned = onode_lru.end();
ceph_assert(p != onode_lru.begin());
--p;
int skipped = 0;
dout(20) << __func__ << " " << o->oid << " has " << refs
<< " refs; skipping" << dendl;
if (++skipped >= max_skipped) {
- dout(20) << __func__ << " maximum skip pinned reached; stopping with "
+ dout(15) << __func__ << " maximum skip pinned reached; stopping with "
<< num << " left to trim" << dendl;
+ last_pinned = p;
break;
}
}
dout(30) << __func__ << " " << o->oid << " num=" << num <<" lru size="<<onode_lru.size()<< dendl;
if (p != onode_lru.begin()) {
- onode_lru.erase(p--);
+ _onode_lru_erase(p--);
} else {
- onode_lru.erase(p);
- ceph_assert(num == 1);
+ _onode_lru_erase(p);
+ num = 1; // fake num to end the loop
+ // in fact we might still have some pinned onodes
}
o->get(); // paranoia
o->c->onode_map.remove(o->oid);
(meta_alloc / (double) num_shards) / meta_cache->get_bytes_per_onode());
uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / num_shards);
+ auto debug_max_onodes = g_conf()->bluestore_debug_max_cached_onodes;
+ if (debug_max_onodes) {
+ max_shard_onodes = debug_max_onodes;
+ }
ldout(cct, 30) << __func__ << " max_shard_onodes: " << max_shard_onodes
<< " max_shard_buffer: " << max_shard_buffer << dendl;
&Buffer::lru_item> > buffer_lru_list_t;
onode_lru_list_t onode_lru;
+ onode_lru_list_t::iterator last_pinned;
buffer_lru_list_t buffer_lru;
uint64_t buffer_size = 0;
+ // Erase helper that keeps the remembered trim position consistent:
+ // if the entry being removed is the one 'last_pinned' refers to,
+ // reset it to end() so the next trim restarts from the list tail
+ // instead of dereferencing a dangling iterator.
+ void _onode_lru_erase(onode_lru_list_t::iterator it) {
+ if (it == last_pinned) {
+ last_pinned = onode_lru.end();
+ }
+ onode_lru.erase(it);
+ }
+
public:
- LRUCache(CephContext* cct) : Cache(cct) {}
+ // last_pinned starts at end(): no trim position has been remembered yet
+ LRUCache(CephContext* cct) : Cache(cct), last_pinned(onode_lru.end()){}
uint64_t _get_num_onodes() override {
return onode_lru.size();
}
}
void _rm_onode(OnodeRef& o) override {
auto q = onode_lru.iterator_to(*o);
- onode_lru.erase(q);
+ // use the wrapper so 'last_pinned' cannot be left dangling
+ _onode_lru_erase(q);
}
void _touch_onode(OnodeRef& o) override;
&Buffer::lru_item> > buffer_list_t;
onode_lru_list_t onode_lru;
+ onode_lru_list_t::iterator last_pinned;
buffer_list_t buffer_hot; ///< "Am" hot buffers
buffer_list_t buffer_warm_in; ///< "A1in" newly warm buffers
uint64_t buffer_list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
+ // Erase helper that keeps the remembered trim position consistent:
+ // if the entry being removed is the one 'last_pinned' refers to,
+ // reset it to end() so the next trim restarts from the list tail
+ // instead of dereferencing a dangling iterator.
+ void _onode_lru_erase(onode_lru_list_t::iterator it) {
+ if (it == last_pinned) {
+ last_pinned = onode_lru.end();
+ }
+ onode_lru.erase(it);
+ }
public:
- TwoQCache(CephContext* cct) : Cache(cct) {}
+ // last_pinned starts at end(): no trim position has been remembered yet
+ TwoQCache(CephContext* cct) : Cache(cct), last_pinned(onode_lru.end()){}
uint64_t _get_num_onodes() override {
return onode_lru.size();
}
}
void _rm_onode(OnodeRef& o) override {
auto q = onode_lru.iterator_to(*o);
- onode_lru.erase(q);
+ // use the wrapper so 'last_pinned' cannot be left dangling
+ _onode_lru_erase(q);
}
void _touch_onode(OnodeRef& o) override;
#include <glob.h>
#include <stdio.h>
#include <string.h>
+#include <set>
#include <iostream>
#include <time.h>
#include <sys/mount.h>
void doSyntheticTest(
int num_ops,
uint64_t max_obj, uint64_t max_wr, uint64_t align);
+ void doOnodeCacheTrimTest();
};
class StoreTestDeferredSetup : public StoreTest {
);
}
+// Verify that onode cache trimming makes progress even when pinned entries
+// sit at the top of the LRU. Scenario: create max_onodes objects, cap the
+// per-shard cache via bluestore_debug_max_cached_onodes and check the
+// l_bluestore_onodes perf counter shrinks accordingly; then pin some onodes
+// (via held omap iterators — assumed to keep them pinned, TODO confirm),
+// cap again, and check that only pinned entries survive; finally unpin and
+// check trimming converges to the cap.
+void StoreTest::doOnodeCacheTrimTest() {
+ int r;
+ coll_t cid(spg_t(pg_t(0, 1), shard_id_t(1)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ vector<ghobject_t> all;
+ const size_t max_onodes = 2000;
+ const size_t max_pinned_onodes = 200;
+ const size_t max_cached_onodes = max_pinned_onodes / 2;
+ const PerfCounters* logger = store->get_perf_counters();
+ size_t onodes;
+ {
+ ObjectStore::Transaction t;
+ for (size_t i = 0; i < max_onodes; ++i) {
+ string name("object_");
+ name += stringify(i);
+ ghobject_t hoid(hobject_t(sobject_t(name, CEPH_NOSNAP)),
+ ghobject_t::NO_GEN, shard_id_t(1));
+ hoid.hobj.pool = 1;
+ all.emplace_back(hoid);
+ t.touch(cid, hoid);
+ if ((i % 100) == 0) {
+ cerr << "Creating object " << hoid << std::endl;
+ }
+ }
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ // wait (up to ~5s) for all created onodes to show up in the perf counter
+ for (size_t i = 0; i < 5; ++i) {
+ onodes = logger->get(l_bluestore_onodes);
+ if (onodes == max_onodes)
+ break;
+ sleep(1);
+ }
+ ceph_assert(onodes == max_onodes);
+
+ // cap the per-shard onode cache; background trimming should shrink the
+ // cache down to the cap
+ SetVal(g_conf(), "bluestore_debug_max_cached_onodes",
+ stringify(max_cached_onodes).c_str());
+
+ for (size_t i = 0; i < 5; ++i) {
+ cerr << " remaining onodes = "
+ << logger->get(l_bluestore_onodes)
+ << std::endl;
+ sleep(1);
+ }
+ onodes = logger->get(l_bluestore_onodes);
+ ceph_assert(onodes == max_cached_onodes);
+
+
+ // revert cache size cap
+ SetVal(g_conf(), "bluestore_debug_max_cached_onodes", "0");
+
+ // pin some onodes
+ vector <ObjectMap::ObjectMapIterator> omap_iterators;
+ for (size_t i = 0; i < max_pinned_onodes; ++i) {
+ omap_iterators.emplace_back(store->get_omap_iterator(ch, all[i]));
+ }
+ // "warm" non-pinned onodes
+ {
+ ObjectStore::Transaction t;
+ for (size_t i = max_pinned_onodes; i < max_onodes; ++i) {
+ t.touch(cid, all[i]);
+ }
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ // wait for the cache to be fully repopulated
+ for (size_t i = 0; i < 5; ++i) {
+ onodes = logger->get(l_bluestore_onodes);
+ if (onodes == max_onodes)
+ break;
+ sleep(1);
+ }
+ ceph_assert(onodes == max_onodes);
+
+ // cap the cache again; only the pinned onodes are expected to survive
+ // trimming even though they occupy the top of the LRU
+ SetVal(g_conf(), "bluestore_debug_max_cached_onodes",
+ stringify(max_cached_onodes).c_str());
+
+ for (size_t i = 0; i < 5; ++i) {
+ cerr << " remaining onodes = "
+ << logger->get(l_bluestore_onodes)
+ << std::endl;
+ sleep(1);
+ }
+ onodes = logger->get(l_bluestore_onodes);
+ ceph_assert(onodes == max_pinned_onodes);
+
+ // unpin onodes
+ omap_iterators.resize(0);
+
+ // with the pins released, trimming should proceed down to the cap
+ for (size_t i = 0; i < 5; ++i) {
+ cerr << " remaining onodes = "
+ << logger->get(l_bluestore_onodes)
+ << std::endl;
+ sleep(1);
+ }
+ onodes = logger->get(l_bluestore_onodes);
+ ceph_assert(onodes == max_cached_onodes);
+
+ {
+ ObjectStore::Transaction t;
+ for (size_t i = 0; i < max_onodes; ++i)
+ t.remove(cid, all[i]);
+ t.remove_collection(cid);
+ cerr << "Cleaning" << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+}
+
+// Run the onode cache trim scenario against the TwoQ cache implementation.
+TEST_P(StoreTestSpecificAUSize, OnodeCacheTrim2QTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ SetVal(g_conf(), "bluestore_cache_type", "2q");
+ g_conf().apply_changes(nullptr);
+
+ StartDeferred(65536);
+ doOnodeCacheTrimTest();
+}
+
+// Run the onode cache trim scenario against the LRU cache implementation.
+TEST_P(StoreTestSpecificAUSize, OnodeCacheTrimLRUTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ SetVal(g_conf(), "bluestore_cache_type", "lru");
+ g_conf().apply_changes(nullptr);
+
+ StartDeferred(65536);
+ doOnodeCacheTrimTest();
+}
+
#endif // WITH_BLUESTORE
int main(int argc, char **argv) {