From a8c44cb03bfe0ac507b98b3b627d9320664a5ce6 Mon Sep 17 00:00:00 2001 From: Cory Snyder Date: Wed, 14 Dec 2022 11:27:33 +0000 Subject: [PATCH] os/bluestore: set rocksdb iterator bounds for Bluestore::_collection_list() When Bluestore::_collection_list() is called during PG removal, rocksdb often has to iterate over many rocksdb deletion tombstones from recenently deleted onode keys. This change bounds the rocksdb iteration when possible, to avoid high latencies caused by iteration over many contiguous deletion tombstones. Fixes: https://tracker.ceph.com/issues/58274 Signed-off-by: Cory Snyder (cherry picked from commit a6bcb577c6626afa226ac63f068cb7287c3042b4) --- src/os/bluestore/BlueStore.cc | 117 +++++++++++++--------------------- 1 file changed, 46 insertions(+), 71 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index dda2b0b1e63..4439be5c7c0 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -12532,7 +12532,6 @@ int BlueStore::_collection_list( Collection *c, const ghobject_t& start, const ghobject_t& end, int max, bool legacy, vector *ls, ghobject_t *pnext) { - if (!c->exists) return -ENOENT; @@ -12540,8 +12539,7 @@ int BlueStore::_collection_list( std::unique_ptr it; ghobject_t coll_range_temp_start, coll_range_temp_end; ghobject_t coll_range_start, coll_range_end; - ghobject_t pend; - bool temp; + std::vector> ranges; if (!pnext) pnext = &static_next; @@ -12575,82 +12573,59 @@ int BlueStore::_collection_list( << " and " << coll_range_start << " to " << coll_range_end << " start " << start << dendl; - if (legacy) { - it = std::make_unique( - cct, db->get_iterator(PREFIX_OBJ)); - } else { - it = std::make_unique( - db->get_iterator(PREFIX_OBJ)); - } - if (start == ghobject_t() || - start.hobj == hobject_t() || - start == c->cid.get_min_hobj()) { - it->upper_bound(coll_range_temp_start); - temp = true; - } else { - if (start.hobj.is_temp()) { - temp = true; - ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end); - } else { - temp = false; - ceph_assert(start >= coll_range_start && start < coll_range_end); - } - dout(20) << __func__ << " temp=" << (int)temp << dendl; - it->lower_bound(start); + + // if specified start is not specifically in the pg normal range, we should start with temp iter + if ((start == ghobject_t() || + start.hobj == hobject_t() || + start == c->cid.get_min_hobj() || + start.hobj.is_temp()) + && coll_range_temp_start != coll_range_temp_end) { + ranges.push_back(std::tuple(std::move(coll_range_temp_start), std::move(coll_range_temp_end))); } - if (end.hobj.is_max()) { - pend = temp ? coll_range_temp_end : coll_range_end; - } else { - if (end.hobj.is_temp()) { - if (temp) { - pend = end; - } else { - *pnext = ghobject_t::get_max(); - return 0; - } + // if end param is in temp section, then we do not need to proceed to the normal section + if (!end.hobj.is_temp()) { + ranges.push_back(std::tuple(std::move(coll_range_start), std::move(coll_range_end))); + } + + for (const auto & [cur_range_start, cur_range_end] : ranges) { + dout(30) << __func__ << " cur_range " << cur_range_start << " to " << cur_range_end << dendl; + + const ghobject_t low = start > cur_range_start ? start : cur_range_start; + const ghobject_t high = end < cur_range_end ? end : cur_range_end; + if (low >= high) { + continue; + } + + std::string kv_low_key, kv_high_key; + _key_encode_prefix(low, &kv_low_key); + _key_encode_prefix(high, &kv_high_key); + kv_high_key.push_back('\xff'); + dout(30) << __func__ << " kv_low_key: " << kv_low_key << " kv_high_key: " << kv_high_key << dendl; + const KeyValueDB::IteratorBounds bounds = KeyValueDB::IteratorBounds{std::move(kv_low_key), std::move(kv_high_key)}; + if (legacy) { + it = std::make_unique( + cct, db->get_iterator(PREFIX_OBJ, 0, std::move(bounds))); } else { - pend = temp ? coll_range_temp_end : end; + it = std::make_unique( + db->get_iterator(PREFIX_OBJ, 0, std::move(bounds))); } - } - dout(20) << __func__ << " pend " << pend << dendl; - while (true) { - if (!it->valid() || it->is_ge(pend)) { - if (!it->valid()) - dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; - else - dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl; - if (temp) { - if (end.hobj.is_temp()) { - if (it->valid() && it->is_lt(coll_range_temp_end)) { - *pnext = it->oid(); - return 0; - } - break; - } - dout(30) << __func__ << " switch to non-temp namespace" << dendl; - temp = false; - it->upper_bound(coll_range_start); - if (end.hobj.is_max()) - pend = coll_range_end; - else - pend = end; - dout(30) << __func__ << " pend " << pend << dendl; - continue; + it->lower_bound(low); + while (it->valid()) { + if (it->oid() < low) { + it->next(); + continue; + } + if (it->oid() > high) { + break; } - if (it->valid() && it->is_lt(coll_range_end)) { + if (ls->size() >= (unsigned)max || it->oid() == high) { *pnext = it->oid(); return 0; } - break; - } - dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl; - if (ls->size() >= (unsigned)max) { - dout(20) << __func__ << " reached max " << max << dendl; - *pnext = it->oid(); - return 0; + dout(20) << __func__ << " oid " << it->oid() << dendl; + ls->push_back(it->oid()); + it->next(); } - ls->push_back(it->oid()); - it->next(); } *pnext = ghobject_t::get_max(); return 0; -- 2.39.5