From d2e9343b0effd467a3f258ce9b87f4edc279f616 Mon Sep 17 00:00:00 2001 From: Cory Snyder Date: Wed, 14 Dec 2022 11:27:33 +0000 Subject: [PATCH] os/bluestore: set rocksdb iterator bounds for Bluestore::_collection_list() When Bluestore::_collection_list() is called during PG removal, rocksdb often has to iterate over many rocksdb deletion tombstones from recenently deleted onode keys. This change bounds the rocksdb iteration when possible, to avoid high latencies caused by iteration over many contiguous deletion tombstones. Fixes: https://tracker.ceph.com/issues/58274 Signed-off-by: Cory Snyder (cherry picked from commit a6bcb577c6626afa226ac63f068cb7287c3042b4) --- src/os/bluestore/BlueStore.cc | 117 +++++++++++++--------------------- 1 file changed, 46 insertions(+), 71 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 046f820b53367..73c767a49c724 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -11787,7 +11787,6 @@ int BlueStore::_collection_list( Collection *c, const ghobject_t& start, const ghobject_t& end, int max, bool legacy, vector *ls, ghobject_t *pnext) { - if (!c->exists) return -ENOENT; @@ -11795,8 +11794,7 @@ int BlueStore::_collection_list( std::unique_ptr it; ghobject_t coll_range_temp_start, coll_range_temp_end; ghobject_t coll_range_start, coll_range_end; - ghobject_t pend; - bool temp; + std::vector> ranges; if (!pnext) pnext = &static_next; @@ -11830,82 +11828,59 @@ int BlueStore::_collection_list( << " and " << coll_range_start << " to " << coll_range_end << " start " << start << dendl; - if (legacy) { - it = std::make_unique( - cct, db->get_iterator(PREFIX_OBJ)); - } else { - it = std::make_unique( - db->get_iterator(PREFIX_OBJ)); - } - if (start == ghobject_t() || - start.hobj == hobject_t() || - start == c->cid.get_min_hobj()) { - it->upper_bound(coll_range_temp_start); - temp = true; - } else { - if (start.hobj.is_temp()) { - temp = true; - ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end); - } else { - temp = false; - ceph_assert(start >= coll_range_start && start < coll_range_end); - } - dout(20) << __func__ << " temp=" << (int)temp << dendl; - it->lower_bound(start); + + // if specified start is not specifically in the pg normal range, we should start with temp iter + if ((start == ghobject_t() || + start.hobj == hobject_t() || + start == c->cid.get_min_hobj() || + start.hobj.is_temp()) + && coll_range_temp_start != coll_range_temp_end) { + ranges.push_back(std::tuple(std::move(coll_range_temp_start), std::move(coll_range_temp_end))); } - if (end.hobj.is_max()) { - pend = temp ? coll_range_temp_end : coll_range_end; - } else { - if (end.hobj.is_temp()) { - if (temp) { - pend = end; - } else { - *pnext = ghobject_t::get_max(); - return 0; - } + // if end param is in temp section, then we do not need to proceed to the normal section + if (!end.hobj.is_temp()) { + ranges.push_back(std::tuple(std::move(coll_range_start), std::move(coll_range_end))); + } + + for (const auto & [cur_range_start, cur_range_end] : ranges) { + dout(30) << __func__ << " cur_range " << cur_range_start << " to " << cur_range_end << dendl; + + const ghobject_t low = start > cur_range_start ? start : cur_range_start; + const ghobject_t high = end < cur_range_end ? end : cur_range_end; + if (low >= high) { + continue; + } + + std::string kv_low_key, kv_high_key; + _key_encode_prefix(low, &kv_low_key); + _key_encode_prefix(high, &kv_high_key); + kv_high_key.push_back('\xff'); + dout(30) << __func__ << " kv_low_key: " << kv_low_key << " kv_high_key: " << kv_high_key << dendl; + const KeyValueDB::IteratorBounds bounds = KeyValueDB::IteratorBounds{std::move(kv_low_key), std::move(kv_high_key)}; + if (legacy) { + it = std::make_unique( + cct, db->get_iterator(PREFIX_OBJ, 0, std::move(bounds))); } else { - pend = temp ? coll_range_temp_end : end; + it = std::make_unique( + db->get_iterator(PREFIX_OBJ, 0, std::move(bounds))); } - } - dout(20) << __func__ << " pend " << pend << dendl; - while (true) { - if (!it->valid() || it->is_ge(pend)) { - if (!it->valid()) - dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; - else - dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl; - if (temp) { - if (end.hobj.is_temp()) { - if (it->valid() && it->is_lt(coll_range_temp_end)) { - *pnext = it->oid(); - return 0; - } - break; - } - dout(30) << __func__ << " switch to non-temp namespace" << dendl; - temp = false; - it->upper_bound(coll_range_start); - if (end.hobj.is_max()) - pend = coll_range_end; - else - pend = end; - dout(30) << __func__ << " pend " << pend << dendl; - continue; + it->lower_bound(low); + while (it->valid()) { + if (it->oid() < low) { + it->next(); + continue; + } + if (it->oid() > high) { + break; } - if (it->valid() && it->is_lt(coll_range_end)) { + if (ls->size() >= (unsigned)max || it->oid() == high) { *pnext = it->oid(); return 0; } - break; - } - dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl; - if (ls->size() >= (unsigned)max) { - dout(20) << __func__ << " reached max " << max << dendl; - *pnext = it->oid(); - return 0; + dout(20) << __func__ << " oid " << it->oid() << dendl; + ls->push_back(it->oid()); + it->next(); } - ls->push_back(it->oid()); - it->next(); } *pnext = ghobject_t::get_max(); return 0; -- 2.39.5