From: Laura Flores Date: Fri, 28 Feb 2025 06:04:42 +0000 (+0000) Subject: osd: add pg-upmap-primary to clean_pg_upmaps X-Git-Tag: testing/wip-vshankar-testing-20260218.045142~16^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=e37d02a3a591f559710471d4ce3dcef76bb9565f;p=ceph-ci.git osd: add pg-upmap-primary to clean_pg_upmaps This commit adds handling of pg_upmap_primary to `clean_pg_upmaps` so invalid mappings are properly removed. Invalid mappings occur when pools are deleted, the PG size is decreased, or an OSD is marked down/out. Test cases cover cases where: 1. PG num is reduced (all pg_upmap_primary mappings should be canceled) 2. An OSD outside of the up set has been mapped as the primary 3. An OSD is marked "out" 4. A mapping is redundant 5. A pool is deleted Run the unit test with: $ ninja unittest_osdmap $ ./bin/unittest_osdmap --gtest_filter=*CleanPGUpmapPrimaries* --debug_osd=10 Fixes: https://tracker.ceph.com/issues/67265 Signed-off-by: Laura Flores --- diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index c29367a4970..7ee1a6da769 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -275,12 +275,16 @@ public: void process(const std::vector& to_check) override { std::vector to_cancel; + std::vector to_cancel_upmap_primary_only; + std::set affected_pools; std::map>> to_remap; - osdmap.check_pg_upmaps(cct, to_check, &to_cancel, &to_remap); + osdmap.check_pg_upmaps(cct, to_check, &to_cancel, &to_cancel_upmap_primary_only, + &affected_pools, &to_remap); // don't bother taking lock if nothing changes - if (!to_cancel.empty() || !to_remap.empty()) { + if (!to_cancel.empty() || !to_remap.empty() || !to_cancel_upmap_primary_only.empty()) { std::lock_guard l(pending_inc_lock); - osdmap.clean_pg_upmaps(cct, &pending_inc, to_cancel, to_remap); + osdmap.clean_pg_upmaps(cct, &pending_inc, to_cancel, to_cancel_upmap_primary_only, + affected_pools, to_remap); } } diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 8f6a8c08b87..46d1a6bc410 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -2102,17 +2102,21 @@ void OSDMap::clean_temps(CephContext *cct, void OSDMap::get_upmap_pgs(vector *upmap_pgs) const { - upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size()); + upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size() + pg_upmap_primaries.size()); for (auto& p : pg_upmap) upmap_pgs->push_back(p.first); for (auto& p : pg_upmap_items) upmap_pgs->push_back(p.first); + for (auto& p : pg_upmap_primaries) + upmap_pgs->push_back(p.first); } bool OSDMap::check_pg_upmaps( CephContext *cct, const vector& to_check, vector *to_cancel, + vector *to_cancel_upmap_primary_only, + set *affected_pools, map>> *to_remap) const { bool any_change = false; @@ -2123,12 +2127,14 @@ bool OSDMap::check_pg_upmaps( ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source" << dendl; to_cancel->push_back(pg); + affected_pools->emplace(pg.pool()); continue; } if (pi->is_pending_merge(pg, nullptr)) { ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge" << dendl; to_cancel->push_back(pg); + affected_pools->emplace(pg.pool()); continue; } vector raw, up; @@ -2182,6 +2188,7 @@ bool OSDMap::check_pg_upmaps( } if (!to_cancel->empty() && to_cancel->back() == pg) continue; + // okay, upmap is valid // continue to check if it is still necessary auto i = pg_upmap.find(pg); @@ -2234,8 +2241,28 @@ bool OSDMap::check_pg_upmaps( any_change = true; } } + // Cancel any pg_upmap_primary mapping where the mapping is set + // to an OSD outside the raw set, or if the mapping is redundant + auto k = pg_upmap_primaries.find(pg); + if (k != pg_upmap_primaries.end()) { + auto curr_prim = k->second; + bool valid_prim = false; + for (auto osd : raw) { + if ((curr_prim == osd) && + (curr_prim != raw.front())) { + valid_prim = true; + break; + } + } + if (!valid_prim) { + ldout(cct, 10) << __func__ << " pg_upmap_primary (PG " << pg << " has an invalid or redundant primary: " + << curr_prim << " -> " << raw << ")" << dendl; + to_cancel_upmap_primary_only->push_back(pg); + any_change = true; + } + } } - any_change = any_change || !to_cancel->empty(); + any_change = any_change || !to_cancel->empty() || !to_cancel_upmap_primary_only->empty(); return any_change; } @@ -2243,6 +2270,8 @@ void OSDMap::clean_pg_upmaps( CephContext *cct, Incremental *pending_inc, const vector& to_cancel, + const vector& to_cancel_upmap_primary_only, + const set& affected_pools, const map>>& to_remap) const { for (auto &pg: to_cancel) { @@ -2261,6 +2290,21 @@ void OSDMap::clean_pg_upmaps( << dendl; pending_inc->old_pg_upmap.insert(pg); } + auto k = pending_inc->new_pg_upmap_primary.find(pg); + if (k != pending_inc->new_pg_upmap_primary.end()) { + ldout(cct, 10) << __func__ << " cancel invalid pending " + << "pg_upmap_primaries entry " + << k->first << "->" << k->second + << dendl; + pending_inc->new_pg_upmap_primary.erase(k); + } + auto l = pg_upmap_primaries.find(pg); + if (l != pg_upmap_primaries.end()) { + ldout(cct, 10) << __func__ << " cancel invalid pg_upmap_primaries entry " + << l->first << "->" << l->second + << dendl; + pending_inc->old_pg_upmap_primary.insert(pg); + } auto p = pending_inc->new_pg_upmap_items.find(pg); if (p != pending_inc->new_pg_upmap_items.end()) { ldout(cct, 10) << __func__ << " cancel invalid pending " @@ -2280,6 +2324,36 @@ void OSDMap::clean_pg_upmaps( } for (auto& i : to_remap) pending_inc->new_pg_upmap_items[i.first] = i.second; + + // Cancel mappings that are only invalid for pg_upmap_primary. + // For example, if a selected primary OSD does not exist + // in that PG's up set, it should be canceled. But this + // could be valid for pg_upmap/pg_upmap_items. + for (auto &pg_prim: to_cancel_upmap_primary_only) { + auto k = pending_inc->new_pg_upmap_primary.find(pg_prim); + if (k != pending_inc->new_pg_upmap_primary.end()) { + ldout(cct, 10) << __func__ << " cancel invalid pending " + << "pg_upmap_primaries entry " + << k->first << "->" << k->second + << dendl; + pending_inc->new_pg_upmap_primary.erase(k); + } + auto l = pg_upmap_primaries.find(pg_prim); + if (l != pg_upmap_primaries.end()) { + ldout(cct, 10) << __func__ << " cancel invalid pg_upmap_primaries entry " + << l->first << "->" << l->second + << dendl; + pending_inc->old_pg_upmap_primary.insert(pg_prim); + } + } + + // Clean all pg_upmap_primary entries where the pool size was changed, + // as old records no longer make sense optimization-wise. + for (auto pid : affected_pools) { + ldout(cct, 10) << __func__ << " cancel all pg_upmap_primaries for pool " << pid + << " since pg_num changed" << dendl; + rm_all_upmap_prims(cct, pending_inc, pid); + } } bool OSDMap::clean_pg_upmaps( @@ -2289,14 +2363,14 @@ bool OSDMap::clean_pg_upmaps( ldout(cct, 10) << __func__ << dendl; vector to_check; vector to_cancel; + vector to_cancel_upmap_primary_only; + set affected_pools; map>> to_remap; get_upmap_pgs(&to_check); - auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap); - clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap); - //TODO: Create these 3 functions for pg_upmap_primaries and so they can be checked - // and cleaned in the same way as pg_upmap. This is not critical since invalid - // pg_upmap_primaries are never applied, (the final check is in _apply_upmap). + auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_cancel_upmap_primary_only, + &affected_pools, &to_remap); + clean_pg_upmaps(cct, pending_inc, to_cancel, to_cancel_upmap_primary_only, affected_pools, to_remap); return any_change; } @@ -5347,17 +5421,21 @@ int OSDMap::balance_primaries( return num_changes; } -void OSDMap::rm_all_upmap_prims(CephContext *cct, OSDMap::Incremental *pending_inc, uint64_t pid) { +void OSDMap::rm_all_upmap_prims( + CephContext *cct, + OSDMap::Incremental *pending_inc, + uint64_t pid) const +{ map> prim_pgs_by_osd; get_pgs_by_osd(cct, pid, &prim_pgs_by_osd); for (auto &[_, pgs] : prim_pgs_by_osd) { for (auto &pg : pgs) { if (pending_inc->new_pg_upmap_primary.contains(pg)) { - ldout(cct,30) << __func__ << "Removing pending pg_upmap_prim for pg " << pg << dendl; + ldout(cct, 30) << __func__ << " Removing pending pg_upmap_prim for pg " << pg << dendl; pending_inc->new_pg_upmap_primary.erase(pg); } if (pg_upmap_primaries.contains(pg)) { - ldout(cct, 30) << __func__ << "Removing pg_upmap_prim for pg " << pg << dendl; + ldout(cct, 30) << __func__ << " Removing pg_upmap_prim for pg " << pg << dendl; pending_inc->old_pg_upmap_primary.insert(pg); } } @@ -5366,7 +5444,7 @@ void OSDMap::rm_all_upmap_prims(CephContext *cct, OSDMap::Incremental *pending_i void OSDMap::rm_all_upmap_prims( CephContext *cct, - OSDMap::Incremental *pending_inc) + OSDMap::Incremental *pending_inc) const { for (const auto& [pg, _] : pg_upmap_primaries) { if (pending_inc->new_pg_upmap_primary.contains(pg)) { @@ -6014,11 +6092,18 @@ map> OSDMap::get_pgs_by_osd( OSDMap tmp_osd_map; tmp_osd_map.deepish_copy_from(*this); + // Set up map to return + map> pgs_by_osd; + // Get the pool from the provided pool id const pg_pool_t* pool = get_pg_pool(pid); + if (!pool) { + ldout(cct, 20) << __func__ << " pool " << pid + << " does not exist" << dendl; + return pgs_by_osd; + } // build array of pgs from the pool - map> pgs_by_osd; for (unsigned ps = 0; ps < pool->get_pg_num(); ++ps) { pg_t pg(ps, pid); vector up; diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index d48d967d6ca..896407235ec 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -1137,16 +1137,21 @@ public: */ uint64_t get_up_osd_features() const; + int get_num_pg_upmap_primaries() const { return pg_upmap_primaries.size(); }; void get_upmap_pgs(std::vector *upmap_pgs) const; bool check_pg_upmaps( CephContext *cct, const std::vector& to_check, std::vector *to_cancel, + std::vector *to_cancel_upmap_primary_only, + std::set *affected_pools, std::map>> *to_remap) const; void clean_pg_upmaps( CephContext *cct, Incremental *pending_inc, const std::vector& to_cancel, + const std::vector& to_cancel_upmap_primary_only, + const std::set& affected_pools, const std::map>>& to_remap) const; bool clean_pg_upmaps(CephContext *cct, Incremental *pending_inc) const; @@ -1522,10 +1527,13 @@ public: OSDMap& tmp_osd_map, const std::optional& rbp = std::nullopt) const; - void rm_all_upmap_prims(CephContext *cct, Incremental *pending_inc, uint64_t pid); // per pool void rm_all_upmap_prims( CephContext *cct, - OSDMap::Incremental *pending_inc); // total + Incremental *pending_inc, + uint64_t pid) const; // per pool + void rm_all_upmap_prims( + CephContext *cct, + OSDMap::Incremental *pending_inc) const; // total int calc_desired_primary_distribution( CephContext *cct, diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc index 1d4e1eaef08..05bc3a2b3ce 100644 --- a/src/test/osd/TestOSDMap.cc +++ b/src/test/osd/TestOSDMap.cc @@ -1366,6 +1366,220 @@ TEST_F(OSDMapTest, CleanPGUpmaps) { } } +TEST_F(OSDMapTest, CleanPGUpmapPrimaries) { + // Set up a default osdmap + set_up_map(); + + // Arrange CRUSH topology with 3 hosts; 2 OSDs per host + const int EXPECTED_HOST_NUM = 3; + int osd_per_host = get_num_osds() / EXPECTED_HOST_NUM; + ASSERT_GE(osd_per_host, 2); + int index = 0; + for (int i = 0; i < (int)get_num_osds(); i++) { + if (i && i % osd_per_host == 0) { + ++index; + } + stringstream osd_name; + stringstream host_name; + vector move_to; + osd_name << "osd." << i; + host_name << "host-" << index; + move_to.push_back("root=default"); + string host_loc = "host=" + host_name.str(); + move_to.push_back(host_loc); + int r = crush_move(osdmap, osd_name.str(), move_to); + ASSERT_EQ(0, r); + } + + // Create a replicated CRUSH rule + const string upmap_primary_rule = "upmap_primary"; + int upmap_primary_rule_no = crush_rule_create_replicated( + upmap_primary_rule, "default", "host"); + ASSERT_LT(0, upmap_primary_rule_no); + + // Create a replicated pool (size 3) with 64 PGs + OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1); + new_pool_inc.new_pool_max = osdmap.get_pool_max(); + new_pool_inc.fsid = osdmap.get_fsid(); + pg_pool_t empty; + uint64_t upmap_primary_pool_id = ++new_pool_inc.new_pool_max; + pg_pool_t *p = new_pool_inc.get_new_pool(upmap_primary_pool_id, &empty); + p->size = 3; + p->set_pg_num(64); + p->set_pgp_num(64); + p->type = pg_pool_t::TYPE_REPLICATED; + p->crush_rule = upmap_primary_rule_no; + p->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + new_pool_inc.new_pool_names[upmap_primary_pool_id] = "upmap_primary_pool"; + osdmap.apply_incremental(new_pool_inc); + + { + // Sanity check - we shouldn't have any pg_upmap_primary entries yet + ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 0); + + // Create valid pg_upmap_primary mappings on the "upmap_primary_pool" + OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1); + OSDMap tmp_osd_map; + tmp_osd_map.deepish_copy_from(osdmap); + int num_changes = osdmap.balance_primaries(g_ceph_context, upmap_primary_pool_id, + &pending_inc, tmp_osd_map); + osdmap.apply_incremental(pending_inc); + + // Check for mappings; `balance_primaries` should have created 10 pg_upmap_primary + // mappings for this configuration + ASSERT_EQ(num_changes, 10); + ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 10); + } + + // -------TEST 1: Reduce number of PGs -------------- + // For the sake of the unit test, we will simulate reducing the pg_num by removing + // the current pool and swapping it out with a new pool that has all the same info, + // except a decreased pg_num + { + // Remove current pool + OSDMap::Incremental remove_pool_inc(osdmap.get_epoch() + 1); + remove_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("upmap_primary_pool")); + osdmap.apply_incremental(remove_pool_inc); + + // Add same pool back in with reduced pg_num (64 --> 16) + OSDMap::Incremental reduced_pool_inc(osdmap.get_epoch() + 1); + pg_pool_t new_empty; + pg_pool_t *reduced_pool = reduced_pool_inc.get_new_pool(upmap_primary_pool_id, &new_empty); + reduced_pool->size = 3; + reduced_pool->set_pg_num(16); + reduced_pool->set_pgp_num(16); + reduced_pool->type = pg_pool_t::TYPE_REPLICATED; + reduced_pool->crush_rule = upmap_primary_rule_no; + reduced_pool->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + reduced_pool_inc.new_pool_names[upmap_primary_pool_id] = "upmap_primary_pool"; + osdmap.apply_incremental(reduced_pool_inc); + } + { + // Clean invalid pg_upmap_primary entries + OSDMap::Incremental cleanup_inc(osdmap.get_epoch() + 1); + clean_pg_upmaps(g_ceph_context, osdmap, cleanup_inc); + osdmap.apply_incremental(cleanup_inc); + + // Ensure that all pg_upmap_primary mappings have been removed + ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 0); + } + + // -------TEST 2: Create Invalid Mapping -------------- + { + // Create 10 valid pg_upmap_primary mappings on the "upmap_primary_pool". + // Valid mappings are where a new primary is set to an OSD that already exists + // in the PG up set but which is not the current primary OSD. + OSDMap::Incremental valid_mappings_inc(osdmap.get_epoch() + 1); + std::cout << osdmap << std::endl; + for (int pg_num = 0; pg_num < 10; ++pg_num) { + pg_t rawpg(pg_num, upmap_primary_pool_id); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + vector raw, up; + osdmap.pg_to_raw_upmap(rawpg, &raw, &up); + int new_primary = raw[1]; // pick the second OSD to be the new prim (this is valid!) + ASSERT_NE(raw[0], new_primary); + valid_mappings_inc.new_pg_upmap_primary[pgid] = new_primary; + } + osdmap.apply_incremental(valid_mappings_inc); + + // Check for 10 pg_upmap_primary mappings. + ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 10); + } + { + // Make an illegal mapping on the next PG by setting the primary + // to an OSD that exists outside the up set. + OSDMap::Incremental illegal_mapping_inc(osdmap.get_epoch() +1); + pg_t rawpg(10, upmap_primary_pool_id); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + vector raw, up; + osdmap.pg_to_raw_upmap(rawpg, &raw, &up); + int new_primary = 2; + for (int osd : raw) { + ASSERT_NE(osd, new_primary); + } + illegal_mapping_inc.new_pg_upmap_primary[pgid] = new_primary; + osdmap.apply_incremental(illegal_mapping_inc); + } + { + // Check for 11 mappings (10 legal, 1 illegal) + ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 11); + + // Clean invalid pg_upmap_primary entry + OSDMap::Incremental cleanup_inc(osdmap.get_epoch() + 1); + clean_pg_upmaps(g_ceph_context, osdmap, cleanup_inc); + osdmap.apply_incremental(cleanup_inc); + + // Ensure that the illegal mapping was removed + ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 10); + } + + // -------TEST 3: Mark an OSD Out -------------- + { + // mark one of the OSDs out + OSDMap::Incremental out_osd_inc(osdmap.get_epoch() + 1); + out_osd_inc.new_weight[0] = CEPH_OSD_OUT; + osdmap.apply_incremental(out_osd_inc); + ASSERT_TRUE(osdmap.is_out(0)); + std::cout << osdmap << std::endl; + } + { + // Clean mappings for out OSDs + OSDMap::Incremental cleanup_inc(osdmap.get_epoch() + 1); + clean_pg_upmaps(g_ceph_context, osdmap, cleanup_inc); + osdmap.apply_incremental(cleanup_inc); + + // Ensure that mappings with out OSDs were removed + ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 7); + } + + // -------TEST 4: Create redundant mapping ----- + { + // Make a redundant mapping on the next PG by setting the primary + // to an OSD that is already primary. + OSDMap::Incremental redundant_mapping_inc(osdmap.get_epoch() +1); + pg_t rawpg(10, upmap_primary_pool_id); + pg_t pgid = osdmap.raw_pg_to_pg(rawpg); + vector raw, up; + osdmap.pg_to_raw_upmap(rawpg, &raw, &up); + int new_primary = 3; + ASSERT_EQ(raw[0], new_primary); + redundant_mapping_inc.new_pg_upmap_primary[pgid] = new_primary; + osdmap.apply_incremental(redundant_mapping_inc); + } + { + // Check for 8 mappings (7 legal, 1 illegal) + ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 8); + + // Clean invalid pg_upmap_primary entry + OSDMap::Incremental cleanup_inc(osdmap.get_epoch() + 1); + clean_pg_upmaps(g_ceph_context, osdmap, cleanup_inc); + osdmap.apply_incremental(cleanup_inc); + + // Ensure that the illegal mapping was removed + ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 7); + } + + // -------TEST 5: Delete the pool -------------- + { + // Delete "upmap_primary_pool" + OSDMap::Incremental delete_pool_inc(osdmap.get_epoch() + 1); + delete_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("upmap_primary_pool")); + osdmap.apply_incremental(delete_pool_inc); + } + { + // Verify that mappings (now invalid) still exist before cleanup + ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 7); + + // Clean invalid pg_upmap_primary entries + OSDMap::Incremental cleanup_inc(osdmap.get_epoch() + 1); + clean_pg_upmaps(g_ceph_context, osdmap, cleanup_inc); + osdmap.apply_incremental(cleanup_inc); + + // Ensure that all pg_upmap_primary mappings have been removed + ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 0); + } +} + TEST_F(OSDMapTest, BUG_38897) { // http://tracker.ceph.com/issues/38897 // build a fresh map with 12 OSDs, without any default pools diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc index 168dc6aa811..f74a593bd5a 100644 --- a/src/tools/osdmaptool.cc +++ b/src/tools/osdmaptool.cc @@ -111,6 +111,11 @@ void print_inc_upmaps(const OSDMap::Incremental& pending_inc, int fd, bool vstar } ss << std::endl; } + for (auto& i : pending_inc.old_pg_upmap_primary) { + if (vstart) + ss << prefix; + ss << cmd + " osd rm-pg-upmap-primary " << i << std::endl; + } for (auto& i : pending_inc.new_pg_upmap_primary) { if (vstart) ss << prefix;