From: Cory Snyder Date: Wed, 14 Dec 2022 11:27:33 +0000 (+0000) Subject: os/bluestore: set rocksdb iterator bounds for Bluestore::_collection_list() X-Git-Tag: v19.1.1~142^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=a8c44cb03bfe0ac507b98b3b627d9320664a5ce6;p=ceph.git os/bluestore: set rocksdb iterator bounds for Bluestore::_collection_list() When Bluestore::_collection_list() is called during PG removal, rocksdb often has to iterate over many rocksdb deletion tombstones from recenently deleted onode keys. This change bounds the rocksdb iteration when possible, to avoid high latencies caused by iteration over many contiguous deletion tombstones. Fixes: https://tracker.ceph.com/issues/58274 Signed-off-by: Cory Snyder (cherry picked from commit a6bcb577c6626afa226ac63f068cb7287c3042b4) --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index dda2b0b1e63a7..4439be5c7c05e 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -12532,7 +12532,6 @@ int BlueStore::_collection_list( Collection *c, const ghobject_t& start, const ghobject_t& end, int max, bool legacy, vector *ls, ghobject_t *pnext) { - if (!c->exists) return -ENOENT; @@ -12540,8 +12539,7 @@ int BlueStore::_collection_list( std::unique_ptr it; ghobject_t coll_range_temp_start, coll_range_temp_end; ghobject_t coll_range_start, coll_range_end; - ghobject_t pend; - bool temp; + std::vector> ranges; if (!pnext) pnext = &static_next; @@ -12575,82 +12573,59 @@ int BlueStore::_collection_list( << " and " << coll_range_start << " to " << coll_range_end << " start " << start << dendl; - if (legacy) { - it = std::make_unique( - cct, db->get_iterator(PREFIX_OBJ)); - } else { - it = std::make_unique( - db->get_iterator(PREFIX_OBJ)); - } - if (start == ghobject_t() || - start.hobj == hobject_t() || - start == c->cid.get_min_hobj()) { - it->upper_bound(coll_range_temp_start); - temp = true; - } else { - if (start.hobj.is_temp()) { - temp = true; - ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end); - } else { - temp = false; - ceph_assert(start >= coll_range_start && start < coll_range_end); - } - dout(20) << __func__ << " temp=" << (int)temp << dendl; - it->lower_bound(start); + + // if specified start is not specifically in the pg normal range, we should start with temp iter + if ((start == ghobject_t() || + start.hobj == hobject_t() || + start == c->cid.get_min_hobj() || + start.hobj.is_temp()) + && coll_range_temp_start != coll_range_temp_end) { + ranges.push_back(std::tuple(std::move(coll_range_temp_start), std::move(coll_range_temp_end))); } - if (end.hobj.is_max()) { - pend = temp ? coll_range_temp_end : coll_range_end; - } else { - if (end.hobj.is_temp()) { - if (temp) { - pend = end; - } else { - *pnext = ghobject_t::get_max(); - return 0; - } + // if end param is in temp section, then we do not need to proceed to the normal section + if (!end.hobj.is_temp()) { + ranges.push_back(std::tuple(std::move(coll_range_start), std::move(coll_range_end))); + } + + for (const auto & [cur_range_start, cur_range_end] : ranges) { + dout(30) << __func__ << " cur_range " << cur_range_start << " to " << cur_range_end << dendl; + + const ghobject_t low = start > cur_range_start ? start : cur_range_start; + const ghobject_t high = end < cur_range_end ? end : cur_range_end; + if (low >= high) { + continue; + } + + std::string kv_low_key, kv_high_key; + _key_encode_prefix(low, &kv_low_key); + _key_encode_prefix(high, &kv_high_key); + kv_high_key.push_back('\xff'); + dout(30) << __func__ << " kv_low_key: " << kv_low_key << " kv_high_key: " << kv_high_key << dendl; + const KeyValueDB::IteratorBounds bounds = KeyValueDB::IteratorBounds{std::move(kv_low_key), std::move(kv_high_key)}; + if (legacy) { + it = std::make_unique( + cct, db->get_iterator(PREFIX_OBJ, 0, std::move(bounds))); } else { - pend = temp ? coll_range_temp_end : end; + it = std::make_unique( + db->get_iterator(PREFIX_OBJ, 0, std::move(bounds))); } - } - dout(20) << __func__ << " pend " << pend << dendl; - while (true) { - if (!it->valid() || it->is_ge(pend)) { - if (!it->valid()) - dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; - else - dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl; - if (temp) { - if (end.hobj.is_temp()) { - if (it->valid() && it->is_lt(coll_range_temp_end)) { - *pnext = it->oid(); - return 0; - } - break; - } - dout(30) << __func__ << " switch to non-temp namespace" << dendl; - temp = false; - it->upper_bound(coll_range_start); - if (end.hobj.is_max()) - pend = coll_range_end; - else - pend = end; - dout(30) << __func__ << " pend " << pend << dendl; - continue; + it->lower_bound(low); + while (it->valid()) { + if (it->oid() < low) { + it->next(); + continue; + } + if (it->oid() > high) { + break; } - if (it->valid() && it->is_lt(coll_range_end)) { + if (ls->size() >= (unsigned)max || it->oid() == high) { *pnext = it->oid(); return 0; } - break; - } - dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl; - if (ls->size() >= (unsigned)max) { - dout(20) << __func__ << " reached max " << max << dendl; - *pnext = it->oid(); - return 0; + dout(20) << __func__ << " oid " << it->oid() << dendl; + ls->push_back(it->oid()); + it->next(); } - ls->push_back(it->oid()); - it->next(); } *pnext = ghobject_t::get_max(); return 0;