From 690739e821557cb6b256cd35446083d2978d53b2 Mon Sep 17 00:00:00 2001 From: Gabriel BenHanokh Date: Wed, 14 Dec 2022 12:55:19 +0000 Subject: [PATCH] =?utf8?q?osd/SnapMapper:=20Maintain=20the=20prefix=5Fitr?= =?utf8?q?=20between=20calls=20to=20SnapMapper::get=5Fnext=5Fobjects=5Fto?= =?utf8?q?=5Ftrim()=20to=20prevent=20searching=20depleted=20prefixes.=20We?= =?utf8?q?=20got=208=20distinct=20hash=20prefixes=20used=20for=20searching?= =?utf8?q?=20objects=20owned=20by=20a=20given=20PG.=20On=20each=20call=20t?= =?utf8?q?o=20SnapMapper::get=5Fnext=5Fobjects=5Fto=5Ftrim()=20we=20start?= =?utf8?q?=20from=20the=20first=20prefix=20even=20after=20all=20objects=20?= =?utf8?q?mapped=20to=20it=20were=20depleted.=20This=20means=20that=20we?= =?utf8?q?=20will=20be=20searching=20for=201=20non-existing=20prefix=20aft?= =?utf8?q?er=20the=20first=20prefix=20was=20depleted,=202=20after=20the=20?= =?utf8?q?first=20two=20prefixes=20were=20depleted...=20and=20so=20on=20un?= =?utf8?q?til=20we=20will=20search=207=20non-existing=20prefixes=20after?= =?utf8?q?=20the=20first=207=20prefixes=20were=C2=A0depleted.?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This is a performance improvement PR only! It maintains the existing behavior and does not try to fix/change any of the TRIM logic. I added an extra step after the last object is trimmed doing a full scan of the DB and only if no object was found it will return ENOENT. This should make the new code no-worse than existing code which returns ENOENT after a full scan found no object. It should not impact performance in real life snaps as it should only happen once per-snap. added snap-mapper tests to rados-test-suite disabled osd_debug_trim_objects when running (SnapMapperTest, prefix_itr) to prevent asserts(as this code does illegal inserts into DELETED snaps) Code beautifing Signed-off-by: Gabriel BenHanokh --- .../singleton-nomsgr/all/ceph-snapmapper.yaml | 22 ++ qa/tasks/ceph.conf.template | 3 +- src/common/options/osd.yaml.in | 5 + src/osd/SnapMapper.cc | 70 +++- src/osd/SnapMapper.h | 25 +- src/test/CMakeLists.txt | 4 + src/test/test_snap_mapper.cc | 323 ++++++++++++++++-- 7 files changed, 403 insertions(+), 49 deletions(-) create mode 100644 qa/suites/rados/singleton-nomsgr/all/ceph-snapmapper.yaml diff --git a/qa/suites/rados/singleton-nomsgr/all/ceph-snapmapper.yaml b/qa/suites/rados/singleton-nomsgr/all/ceph-snapmapper.yaml new file mode 100644 index 0000000000000..cbfd08620e768 --- /dev/null +++ b/qa/suites/rados/singleton-nomsgr/all/ceph-snapmapper.yaml @@ -0,0 +1,22 @@ +openstack: + - volumes: # attached to each instance + count: 3 + size: 10 # GB +roles: +- [mon.a, mgr.x, osd.0, osd.1, osd.2, client.0] + +overrides: + ceph: + pre-mgr-commands: + - sudo ceph config set mgr mgr_pool false --force + log-ignorelist: + - but it is still running + - overall HEALTH_ + - \(POOL_APP_NOT_ENABLED\) + +tasks: +- install: +- ceph: +- exec: + client.0: + - ceph_test_snap_mapper diff --git a/qa/tasks/ceph.conf.template b/qa/tasks/ceph.conf.template index a9cce29539aa1..7ae2b83c29515 100644 --- a/qa/tasks/ceph.conf.template +++ b/qa/tasks/ceph.conf.template @@ -55,8 +55,9 @@ osd debug shutdown = true osd debug op order = true osd debug verify stray on activate = true + osd debug trim objects = true - osd open classes on start = true + osd open classes on start = true osd debug pg log writeout = true osd deep scrub update digest min age = 30 diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in index 067e04df946f4..ed81114cfe3ce 100644 --- a/src/common/options/osd.yaml.in +++ b/src/common/options/osd.yaml.in @@ -194,6 +194,11 @@ options: load on busy clusters. default: false with_legacy: true +- name: osd_debug_trim_objects + type: bool + level: advanced + desc: Asserts that no clone-objects were added to a snap after we start trimming it + default: false - name: osd_repair_during_recovery type: bool level: advanced diff --git a/src/osd/SnapMapper.cc b/src/osd/SnapMapper.cc index 7893bc08fdcb6..b12e063548a9c 100644 --- a/src/osd/SnapMapper.cc +++ b/src/osd/SnapMapper.cc @@ -540,36 +540,28 @@ void SnapMapper::add_oid( backend.set_keys(to_add, t); } -int SnapMapper::get_next_objects_to_trim( +void SnapMapper::get_objects_by_prefixes( snapid_t snap, unsigned max, vector *out) { - ceph_assert(out); - ceph_assert(out->empty()); - - // if max would be 0, we return ENOENT and the caller would mistakenly - // trim the snaptrim queue - ceph_assert(max > 0); - int r = 0; - - /// \todo cache the prefixes-set in update_bits() - for (set::iterator i = prefixes.begin(); - i != prefixes.end() && out->size() < max && r == 0; - ++i) { - string prefix(get_prefix(pool, snap) + *i); + /// maintain the prefix_itr between calls to avoid searching depleted prefixes + for ( ; prefix_itr != prefixes.end(); prefix_itr++) { + string prefix(get_prefix(pool, snap) + *prefix_itr); string pos = prefix; while (out->size() < max) { pair next; - r = backend.get_next(pos, &next); + // access RocksDB (an expensive operation!) + int r = backend.get_next(pos, &next); dout(20) << __func__ << " get_next(" << pos << ") returns " << r << " " << next << dendl; if (r != 0) { - break; // Done + return; // Done } if (next.first.substr(0, prefix.size()) != prefix) { + // TBD: we access the DB twice for the first object of each iterator... break; // Done with this prefix } @@ -583,7 +575,52 @@ int SnapMapper::get_next_objects_to_trim( out->push_back(next_decoded.second); pos = next.first; } + + if (out->size() >= max) { + return; + } } +} + +int SnapMapper::get_next_objects_to_trim( + snapid_t snap, + unsigned max, + vector *out) +{ + ceph_assert(out); + ceph_assert(out->empty()); + + // if max would be 0, we return ENOENT and the caller would mistakenly + // trim the snaptrim queue + ceph_assert(max > 0); + + // The prefix_itr is bound to a prefix_itr_snap so if we trim another snap + // we must reset the prefix_itr (should not happen normally) + if (prefix_itr_snap != snap) { + dout(10) << __func__ << "::Reset prefix_itr(unexpcted) snap was changed from <" + << prefix_itr_snap << "> to <" << snap << ">" << dendl; + reset_prefix_itr(snap); + } + + // when reaching the end of the DB reset the prefix_ptr and verify + // we didn't miss objects which were added after we started trimming + // This should never happen in reality because the snap was logically deleted + // before trimming starts (and so no new clone-objects could be added) + // For more info see PG::filter_snapc() + // + // We still like to be extra careful and run one extra loop over all prefeixes + get_objects_by_prefixes(snap, max, out); + if (out->size() == 0) { + dout(10) << __func__ << "::Reset prefix_itr after a complete trim!" << dendl; + reset_prefix_itr(snap); + get_objects_by_prefixes(snap, max, out); + + bool osd_debug_trim_objects = cct->_conf.get_val("osd_debug_trim_objects"); + if (osd_debug_trim_objects) { + ceph_assert(out->size() == 0); + } + } + if (out->size() == 0) { return -ENOENT; } else { @@ -591,7 +628,6 @@ int SnapMapper::get_next_objects_to_trim( } } - int SnapMapper::remove_oid( const hobject_t &oid, MapCacher::Transaction *t) diff --git a/src/osd/SnapMapper.h b/src/osd/SnapMapper.h index eb43a23c2b0e0..d2dcf69324bf7 100644 --- a/src/osd/SnapMapper.h +++ b/src/osd/SnapMapper.h @@ -281,6 +281,24 @@ private: tl::expected get_snaps_common( const hobject_t &hoid) const; + /// file @out vector with the first objects with @snap as a snap + void get_objects_by_prefixes( + snapid_t snap, + unsigned max, + std::vector *out); + + std::set prefixes; + // maintain a current active prefix + std::set::iterator prefix_itr; + // associate the active prefix with a snap + snapid_t prefix_itr_snap; + + // reset the prefix iterator to the first prefix hash + void reset_prefix_itr(snapid_t snap) { + prefix_itr_snap = snap; + prefix_itr = prefixes.begin(); + } + public: static std::string make_shard_prefix(shard_id_t shard) { if (shard == shard_id_t::NO_SHARD) @@ -309,7 +327,6 @@ private: update_bits(mask_bits); } - std::set prefixes; /// Update bits in case of pg split or merge void update_bits( uint32_t new_bits ///< [in] new split bits @@ -323,6 +340,12 @@ private: for (auto i = _prefixes.begin(); i != _prefixes.end(); ++i) { prefixes.insert(shard_prefix + *i); } + + reset_prefix_itr(CEPH_NOSNAP); + } + + const std::set::iterator get_prefix_itr() { + return prefix_itr; } /// Update snaps for oid, empty new_snaps removes the mapping diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 09281ab2dbf54..5bfa5397a31f0 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -466,6 +466,10 @@ add_executable(ceph_test_snap_mapper $ ) target_link_libraries(ceph_test_snap_mapper osd global ${BLKID_LIBRARIES} ${UNITTEST_LIBS}) + +install(TARGETS + ceph_test_snap_mapper + DESTINATION ${CMAKE_INSTALL_BINDIR}) endif(NOT WIN32) add_executable(ceph_test_stress_watch diff --git a/src/test/test_snap_mapper.cc b/src/test/test_snap_mapper.cc index e502892cc42f1..6e7b4c566043c 100644 --- a/src/test/test_snap_mapper.cc +++ b/src/test/test_snap_mapper.cc @@ -461,7 +461,19 @@ public: uint32_t bits) : driver(driver), mapper(new SnapMapper(g_ceph_context, driver, mask, bits, 0, shard_id_t(1))), - mask(mask), bits(bits) {} + mask(mask), bits(bits) {} + + hobject_t create_hobject( + unsigned idx, + snapid_t snapid, + int64_t pool, + const std::string& nspace) { + const object_t oid("OID" + std::to_string(idx)); + const std::string key("KEY" + std::to_string(idx)); + const uint32_t hash = (idx & ((~0)< &snaps) { + hobject_to_snap[obj] = snaps; + for (auto snap : snaps) { + map >::iterator j = snap_to_hobject.find(snap); + ceph_assert(j != snap_to_hobject.end()); + j->second.insert(obj); + } + { + PausyAsyncMap::Transaction t; + mapper->add_oid(obj, snaps, &t); + driver->submit(&t); + } } void create_object() { @@ -494,20 +525,9 @@ public: obj = random_hobject(); } while (hobject_to_snap.count(obj)); - set &snaps = hobject_to_snap[obj]; + set snaps; choose_random_snaps(1 + (rand() % 20), &snaps); - for (set::iterator i = snaps.begin(); - i != snaps.end(); - ++i) { - map >::iterator j = snap_to_hobject.find(*i); - ceph_assert(j != snap_to_hobject.end()); - j->second.insert(obj); - } - { - PausyAsyncMap::Transaction t; - mapper->add_oid(obj, snaps, &t); - driver->submit(&t); - } + add_object_to_snaps(obj, snaps); } std::pair to_raw( @@ -535,27 +555,23 @@ public: return mapper->make_purged_snap_key(std::forward(args)...); } - void trim_snap() { - std::lock_guard l{lock}; - if (snap_to_hobject.empty()) - return; - map >::iterator snap = - rand_choose(snap_to_hobject); - set hobjects = snap->second; - + // must be called with lock held to protect access to + // snap_to_hobject and hobject_to_snap + int trim_snap(snapid_t snapid, unsigned max_count, vector & out) { + set& hobjects = snap_to_hobject[snapid]; vector hoids; - while (mapper->get_next_objects_to_trim( - snap->first, rand() % 5 + 1, &hoids) == 0) { + int ret = mapper->get_next_objects_to_trim(snapid, max_count, &hoids); + if (ret == 0) { + out.insert(out.end(), hoids.begin(), hoids.end()); for (auto &&hoid: hoids) { ceph_assert(!hoid.is_max()); ceph_assert(hobjects.count(hoid)); hobjects.erase(hoid); - map>::iterator j = - hobject_to_snap.find(hoid); - ceph_assert(j->second.count(snap->first)); + map>::iterator j = hobject_to_snap.find(hoid); + ceph_assert(j->second.count(snapid)); set old_snaps(j->second); - j->second.erase(snap->first); + j->second.erase(snapid); { PausyAsyncMap::Transaction t; @@ -573,6 +589,49 @@ public: } hoids.clear(); } + return ret; + } + + // must be called with lock held to protect access to + // snap_to_hobject and hobject_to_snap in trim_snap + // will keep trimming until reaching max_count or failing a call to trim_snap() + void trim_snap_force(snapid_t snapid, + unsigned max_count, + vector& out) { + + int guard = 1000; + vector tmp; + unsigned prev_size = 0; + while (tmp.size() < max_count) { + unsigned req_size = max_count - tmp.size(); + // each call adds more objects into the tmp vector + trim_snap(snapid, req_size, tmp); + if (prev_size < tmp.size()) { + prev_size = tmp.size(); + } + else{ + // the tmp vector size was not increased in the last call + // which means we were unable to find anything to trim + break; + } + ceph_assert(--guard > 0); + } + out.insert(out.end(), tmp.begin(), tmp.end()); + } + + void trim_snap() { + std::lock_guard l{lock}; + if (snap_to_hobject.empty()) { + return; + } + int ret = 0; + map >::iterator snap = rand_choose(snap_to_hobject); + do { + int max_count = rand() % 5 + 1; + vector out; + ret = trim_snap(snap->first, max_count, out); + } while(ret == 0); + set hobjects = snap->second; ceph_assert(hobjects.empty()); snap_to_hobject.erase(snap); } @@ -612,6 +671,189 @@ public: ceph_assert(r == 0); ASSERT_EQ(snaps, obj->second); } + + void test_prefix_itr() { + // protects access to snap_to_hobject and hobject_to_snap + std::lock_guard l{lock}; + snapid_t snapid = create_snap(); + // we initialize 32 PGS + ceph_assert(bits == 5); + + const int64_t pool(0); + const std::string nspace("GBH"); + set snaps = { snapid }; + set& hobjects = snap_to_hobject[snapid]; + vector trimmed_objs; + vector stored_objs; + + // add objects 0, 32, 64, 96, 128, 160, 192, 224 + // which should hit all the prefixes + for (unsigned idx = 0; idx < 8; idx++) { + hobject_t hobj = create_hobject(idx * 32, snapid, pool, nspace); + add_object_to_snaps(hobj, snaps); + stored_objs.push_back(hobj); + } + ceph_assert(hobjects.size() == 8); + + // trim 0, 32, 64, 96 + trim_snap(snapid, 4, trimmed_objs); + ceph_assert(hobjects.size() == 4); + + // add objects (3, 35, 67) before the prefix_itr position + // to force an iteartor reset later + for (unsigned idx = 0; idx < 3; idx++) { + hobject_t hobj = create_hobject(idx * 32 + 3, snapid, pool, nspace); + add_object_to_snaps(hobj, snaps); + stored_objs.push_back(hobj); + } + ceph_assert(hobjects.size() == 7); + + // will now trim 128, 160, 192, 224 + trim_snap(snapid, 4, trimmed_objs); + ceph_assert(hobjects.size() == 3); + + // finally, trim 3, 35, 67 + // This will force a reset to the prefix_itr (which is what we test here) + trim_snap(snapid, 3, trimmed_objs); + ceph_assert(hobjects.size() == 0); + + ceph_assert(trimmed_objs.size() == 11); + // trimmed objs must be in the same order they were inserted + // this will prove that the second call to add_object_to_snaps inserted + // them before the current prefix_itr + ceph_assert(trimmed_objs.size() == stored_objs.size()); + ceph_assert(std::equal(trimmed_objs.begin(), trimmed_objs.end(), + stored_objs.begin())); + snap_to_hobject.erase(snapid); + } + + // insert 256 objects which should populate multiple prefixes + // trim until we change prefix and then insert an old object + // which we know for certain belongs to a prefix before prefix_itr + void test_prefix_itr2() { + // protects access to snap_to_hobject and hobject_to_snap + std::lock_guard l{lock}; + snapid_t snapid = create_snap(); + // we initialize 32 PGS + ceph_assert(bits == 5); + + const int64_t pool(0); + const std::string nspace("GBH"); + set snaps = { snapid }; + vector trimmed_objs; + vector stored_objs; + + constexpr unsigned MAX_IDX = 256; + for (unsigned idx = 0; idx < MAX_IDX; idx++) { + hobject_t hobj = create_hobject(idx, snapid, pool, nspace); + add_object_to_snaps(hobj, snaps); + stored_objs.push_back(hobj); + } + + hobject_t dup_hobj; + bool found = false; + trim_snap(snapid, 1, trimmed_objs); + const std::set::iterator itr = mapper->get_prefix_itr(); + for (unsigned idx = 1; idx < MAX_IDX + 1; idx++) { + trim_snap(snapid, 1, trimmed_objs); + if (!found && mapper->get_prefix_itr() != itr) { + // we changed prefix -> insert an OBJ belonging to perv prefix + dup_hobj = create_hobject(idx - 1, snapid, pool, nspace); + add_object_to_snaps(dup_hobj, snaps); + stored_objs.push_back(dup_hobj); + found = true; + } + } + ceph_assert(found); + + sort(trimmed_objs.begin(), trimmed_objs.end()); + sort(stored_objs.begin(), stored_objs.end()); + ceph_assert(trimmed_objs.size() == MAX_IDX+1); + ceph_assert(trimmed_objs.size() == stored_objs.size()); + ceph_assert(std::equal(trimmed_objs.begin(), trimmed_objs.end(), + stored_objs.begin())); + snap_to_hobject.erase(snapid); + } + + void add_rand_hobjects(unsigned count, + snapid_t snapid, + int64_t pool, + const std::string& nspace, + vector& stored_objs) { + constexpr unsigned MAX_VAL = 1000; + set snaps = { snapid }; + for (unsigned i = 0; i < count; i++) { + hobject_t hobj; + do { + unsigned val = rand() % MAX_VAL; + hobj = create_hobject(val, snapid, pool, nspace); + }while (hobject_to_snap.count(hobj)); + add_object_to_snaps(hobj, snaps); + stored_objs.push_back(hobj); + } + } + + // Start with a set of random objects then run a partial trim + // followed by another random insert + // This should cause *some* objects to be added before the prefix_itr + // and will verify that we still remove them + void test_prefix_itr_rand() { + // protects access to snap_to_hobject and hobject_to_snap + std::lock_guard l{lock}; + snapid_t snapid = create_snap(); + // we initialize 32 PGS + ceph_assert(bits == 5); + + const int64_t pool(0); + const std::string nspace("GBH"); + vector trimmed_objs; + vector stored_objs; + set& hobjects = snap_to_hobject[snapid]; + ceph_assert(hobjects.size() == 0); + + // add 100 random objects + add_rand_hobjects(100, snapid, pool, nspace, stored_objs); + ceph_assert(hobjects.size() == 100); + + // trim the first 75 objects leaving 25 objects + trim_snap(snapid, 75, trimmed_objs); + ceph_assert(hobjects.size() == 25); + + // add another 25 random objects (now we got 50 objects) + add_rand_hobjects(25, snapid, pool, nspace, stored_objs); + ceph_assert(hobjects.size() == 50); + + // trim 49 objects leaving a single object + // we must use a wrapper function to keep trimming while until -ENOENT + trim_snap_force(snapid, 49, trimmed_objs); + ceph_assert(hobjects.size() == 1); + + // add another 9 random objects (now we got 10 objects) + add_rand_hobjects(9, snapid, pool, nspace, stored_objs); + ceph_assert(hobjects.size() == 10); + + // trim 10 objects leaving no object in store + trim_snap_force(snapid, 10, trimmed_objs); + ceph_assert(hobjects.size() == 0); + + // add 10 random objects (now we got 10 objects) + add_rand_hobjects(10, snapid, pool, nspace, stored_objs); + ceph_assert(hobjects.size() == 10); + + // trim 10 objects leaving no object in store + trim_snap_force(snapid, 10, trimmed_objs); + ceph_assert(hobjects.size() == 0); + + sort(trimmed_objs.begin(), trimmed_objs.end()); + sort(stored_objs.begin(), stored_objs.end()); + ceph_assert(trimmed_objs.size() == 144); + ceph_assert(trimmed_objs.size() == stored_objs.size()); + + bool are_equal = std::equal(trimmed_objs.begin(), trimmed_objs.end(), + stored_objs.begin()); + ceph_assert(are_equal); + snap_to_hobject.erase(snapid); + } }; class SnapMapperTest : public ::testing::Test { @@ -675,6 +917,27 @@ protected: } }; +// This test creates scenarios which are impossible to get with normal code. +// The normal code deletes the snap before calling TRIM and so no new clones +// can be added to that snap. +// Our test calls get_next_objects_to_trim() *without* deleting the snap first. +// This allows us to add objects to the (non-deleted) snap after trimming began. +// We test that SnapTrim will find them even when added into positions before the prefix_itr. +// Since those tests are doing illegal inserts we must disable osd_debug_trim_objects +// during those tests as otherwise the code will assert. +TEST_F(SnapMapperTest, prefix_itr) { + bool orig_val = g_ceph_context->_conf.get_val("osd_debug_trim_objects"); + std::cout << "osd_debug_trim_objects = " << orig_val << std::endl; + g_ceph_context->_conf.set_val("osd_debug_trim_objects", std::to_string(false)); + init(32); + get_tester().test_prefix_itr(); + get_tester().test_prefix_itr2(); + get_tester().test_prefix_itr_rand(); + g_ceph_context->_conf.set_val("osd_debug_trim_objects", std::to_string(orig_val)); + bool curr_val = g_ceph_context->_conf.get_val("osd_debug_trim_objects"); + ceph_assert(curr_val == orig_val); +} + TEST_F(SnapMapperTest, Simple) { init(1); get_tester().create_snap(); -- 2.39.5