void process(const std::vector<pg_t>& to_check) override {
std::vector<pg_t> to_cancel;
+ std::vector<pg_t> to_cancel_upmap_primary_only;
+ std::set<uint64_t> affected_pools;
std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>> to_remap;
- osdmap.check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
+ osdmap.check_pg_upmaps(cct, to_check, &to_cancel, &to_cancel_upmap_primary_only,
+ &affected_pools, &to_remap);
// don't bother taking lock if nothing changes
- if (!to_cancel.empty() || !to_remap.empty()) {
+ if (!to_cancel.empty() || !to_remap.empty() || !to_cancel_upmap_primary_only.empty()) {
std::lock_guard l(pending_inc_lock);
- osdmap.clean_pg_upmaps(cct, &pending_inc, to_cancel, to_remap);
+ osdmap.clean_pg_upmaps(cct, &pending_inc, to_cancel, to_cancel_upmap_primary_only,
+ affected_pools, to_remap);
}
}
void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const
{
- upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size());
+ upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size() + pg_upmap_primaries.size());
for (auto& p : pg_upmap)
upmap_pgs->push_back(p.first);
for (auto& p : pg_upmap_items)
upmap_pgs->push_back(p.first);
+ for (auto& p : pg_upmap_primaries)
+ upmap_pgs->push_back(p.first);
}
bool OSDMap::check_pg_upmaps(
CephContext *cct,
const vector<pg_t>& to_check,
vector<pg_t> *to_cancel,
+ vector<pg_t> *to_cancel_upmap_primary_only,
+ set<uint64_t> *affected_pools,
map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const
{
bool any_change = false;
ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source"
<< dendl;
to_cancel->push_back(pg);
+ affected_pools->emplace(pg.pool());
continue;
}
if (pi->is_pending_merge(pg, nullptr)) {
ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge"
<< dendl;
to_cancel->push_back(pg);
+ affected_pools->emplace(pg.pool());
continue;
}
vector<int> raw, up;
}
if (!to_cancel->empty() && to_cancel->back() == pg)
continue;
+
// okay, upmap is valid
// continue to check if it is still necessary
auto i = pg_upmap.find(pg);
any_change = true;
}
}
+ // Cancel any pg_upmap_primary mapping where the mapping is set
+ // to an OSD outside the raw set, or if the mapping is redundant
+ auto k = pg_upmap_primaries.find(pg);
+ if (k != pg_upmap_primaries.end()) {
+ auto curr_prim = k->second;
+ bool valid_prim = false;
+ for (auto osd : raw) {
+ if ((curr_prim == osd) &&
+ (curr_prim != raw.front())) {
+ valid_prim = true;
+ break;
+ }
+ }
+ if (!valid_prim) {
+ ldout(cct, 10) << __func__ << " cancel invalid or redundant pg_upmap_primary for PG "
+ << pg << ": " << curr_prim << " -> " << raw << dendl;
+ to_cancel_upmap_primary_only->push_back(pg);
+ any_change = true;
+ }
+ }
}
- any_change = any_change || !to_cancel->empty();
+ any_change = any_change || !to_cancel->empty() || !to_cancel_upmap_primary_only->empty();
return any_change;
}
CephContext *cct,
Incremental *pending_inc,
const vector<pg_t>& to_cancel,
+ const vector<pg_t>& to_cancel_upmap_primary_only,
+ const set<uint64_t>& affected_pools,
const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const
{
for (auto &pg: to_cancel) {
<< dendl;
pending_inc->old_pg_upmap.insert(pg);
}
+ auto k = pending_inc->new_pg_upmap_primary.find(pg);
+ if (k != pending_inc->new_pg_upmap_primary.end()) {
+ ldout(cct, 10) << __func__ << " cancel invalid pending "
+ << "pg_upmap_primaries entry "
+ << k->first << "->" << k->second
+ << dendl;
+ pending_inc->new_pg_upmap_primary.erase(k);
+ }
+ auto l = pg_upmap_primaries.find(pg);
+ if (l != pg_upmap_primaries.end()) {
+ ldout(cct, 10) << __func__ << " cancel invalid pg_upmap_primaries entry "
+ << l->first << "->" << l->second
+ << dendl;
+ pending_inc->old_pg_upmap_primary.insert(pg);
+ }
auto p = pending_inc->new_pg_upmap_items.find(pg);
if (p != pending_inc->new_pg_upmap_items.end()) {
ldout(cct, 10) << __func__ << " cancel invalid pending "
}
for (auto& i : to_remap)
pending_inc->new_pg_upmap_items[i.first] = i.second;
+
+ // Cancel mappings that are only invalid for pg_upmap_primary.
+ // For example, if a selected primary OSD does not exist
+ // in that PG's up set, it should be canceled. But this
+ // could be valid for pg_upmap/pg_upmap_items.
+ for (auto &pg_prim: to_cancel_upmap_primary_only) {
+ auto k = pending_inc->new_pg_upmap_primary.find(pg_prim);
+ if (k != pending_inc->new_pg_upmap_primary.end()) {
+ ldout(cct, 10) << __func__ << " cancel invalid pending "
+ << "pg_upmap_primaries entry "
+ << k->first << "->" << k->second
+ << dendl;
+ pending_inc->new_pg_upmap_primary.erase(k);
+ }
+ auto l = pg_upmap_primaries.find(pg_prim);
+ if (l != pg_upmap_primaries.end()) {
+ ldout(cct, 10) << __func__ << " cancel invalid pg_upmap_primaries entry "
+ << l->first << "->" << l->second
+ << dendl;
+ pending_inc->old_pg_upmap_primary.insert(pg_prim);
+ }
+ }
+
+ // Clean all pg_upmap_primary entries for pools whose pg_num was changed
+ // (PGs removed or merged), as old records no longer make sense optimization-wise.
+ for (auto pid : affected_pools) {
+ ldout(cct, 10) << __func__ << " cancel all pg_upmap_primaries for pool " << pid
+ << " since pg_num changed" << dendl;
+ rm_all_upmap_prims(cct, pending_inc, pid);
+ }
}
bool OSDMap::clean_pg_upmaps(
ldout(cct, 10) << __func__ << dendl;
vector<pg_t> to_check;
vector<pg_t> to_cancel;
+ vector<pg_t> to_cancel_upmap_primary_only;
+ set<uint64_t> affected_pools;
map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
get_upmap_pgs(&to_check);
- auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
- clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap);
- //TODO: Create these 3 functions for pg_upmap_primaries and so they can be checked
- // and cleaned in the same way as pg_upmap. This is not critical since invalid
- // pg_upmap_primaries are never applied, (the final check is in _apply_upmap).
+ auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_cancel_upmap_primary_only,
+ &affected_pools, &to_remap);
+ clean_pg_upmaps(cct, pending_inc, to_cancel, to_cancel_upmap_primary_only, affected_pools, to_remap);
return any_change;
}
return num_changes;
}
-void OSDMap::rm_all_upmap_prims(CephContext *cct, OSDMap::Incremental *pending_inc, uint64_t pid) {
+void OSDMap::rm_all_upmap_prims(
+ CephContext *cct,
+ OSDMap::Incremental *pending_inc,
+ uint64_t pid) const
+{
map<uint64_t,set<pg_t>> prim_pgs_by_osd;
get_pgs_by_osd(cct, pid, &prim_pgs_by_osd);
for (auto &[_, pgs] : prim_pgs_by_osd) {
for (auto &pg : pgs) {
if (pending_inc->new_pg_upmap_primary.contains(pg)) {
- ldout(cct,30) << __func__ << "Removing pending pg_upmap_prim for pg " << pg << dendl;
+ ldout(cct, 30) << __func__ << " Removing pending pg_upmap_prim for pg " << pg << dendl;
pending_inc->new_pg_upmap_primary.erase(pg);
}
if (pg_upmap_primaries.contains(pg)) {
- ldout(cct, 30) << __func__ << "Removing pg_upmap_prim for pg " << pg << dendl;
+ ldout(cct, 30) << __func__ << " Removing pg_upmap_prim for pg " << pg << dendl;
pending_inc->old_pg_upmap_primary.insert(pg);
}
}
void OSDMap::rm_all_upmap_prims(
CephContext *cct,
- OSDMap::Incremental *pending_inc)
+ OSDMap::Incremental *pending_inc) const
{
for (const auto& [pg, _] : pg_upmap_primaries) {
if (pending_inc->new_pg_upmap_primary.contains(pg)) {
OSDMap tmp_osd_map;
tmp_osd_map.deepish_copy_from(*this);
+ // Set up map to return
+ map<uint64_t,set<pg_t>> pgs_by_osd;
+
// Get the pool from the provided pool id
const pg_pool_t* pool = get_pg_pool(pid);
+ if (!pool) {
+ ldout(cct, 20) << __func__ << " pool " << pid
+ << " does not exist" << dendl;
+ return pgs_by_osd;
+ }
// build array of pgs from the pool
- map<uint64_t,set<pg_t>> pgs_by_osd;
for (unsigned ps = 0; ps < pool->get_pg_num(); ++ps) {
pg_t pg(ps, pid);
vector<int> up;
*/
uint64_t get_up_osd_features() const;
+ int get_num_pg_upmap_primaries() const { return pg_upmap_primaries.size(); }
void get_upmap_pgs(std::vector<pg_t> *upmap_pgs) const;
bool check_pg_upmaps(
CephContext *cct,
const std::vector<pg_t>& to_check,
std::vector<pg_t> *to_cancel,
+ std::vector<pg_t> *to_cancel_upmap_primary_only,
+ std::set<uint64_t> *affected_pools,
std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>> *to_remap) const;
void clean_pg_upmaps(
CephContext *cct,
Incremental *pending_inc,
const std::vector<pg_t>& to_cancel,
+ const std::vector<pg_t>& to_cancel_upmap_primary_only,
+ const std::set<uint64_t>& affected_pools,
const std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>>& to_remap) const;
bool clean_pg_upmaps(CephContext *cct, Incremental *pending_inc) const;
OSDMap& tmp_osd_map,
const std::optional<rb_policy>& rbp = std::nullopt) const;
- void rm_all_upmap_prims(CephContext *cct, Incremental *pending_inc, uint64_t pid); // per pool
void rm_all_upmap_prims(
CephContext *cct,
- OSDMap::Incremental *pending_inc); // total
+ Incremental *pending_inc,
+ uint64_t pid) const; // per pool
+ void rm_all_upmap_prims(
+ CephContext *cct,
+ OSDMap::Incremental *pending_inc) const; // total
int calc_desired_primary_distribution(
CephContext *cct,
}
}
+TEST_F(OSDMapTest, CleanPGUpmapPrimaries) {
+ // Set up a default osdmap
+ set_up_map();
+
+ // Arrange CRUSH topology with 3 hosts; 2 OSDs per host
+ const int EXPECTED_HOST_NUM = 3;
+ int osd_per_host = get_num_osds() / EXPECTED_HOST_NUM;
+ ASSERT_GE(osd_per_host, 2);
+ int index = 0;
+ for (int i = 0; i < (int)get_num_osds(); i++) {
+ if (i && i % osd_per_host == 0) {
+ ++index;
+ }
+ stringstream osd_name;
+ stringstream host_name;
+ vector<string> move_to;
+ osd_name << "osd." << i;
+ host_name << "host-" << index;
+ move_to.push_back("root=default");
+ string host_loc = "host=" + host_name.str();
+ move_to.push_back(host_loc);
+ int r = crush_move(osdmap, osd_name.str(), move_to);
+ ASSERT_EQ(0, r);
+ }
+
+ // Create a replicated CRUSH rule
+ const string upmap_primary_rule = "upmap_primary";
+ int upmap_primary_rule_no = crush_rule_create_replicated(
+ upmap_primary_rule, "default", "host");
+ ASSERT_LT(0, upmap_primary_rule_no);
+
+ // Create a replicated pool (size 3) with 64 PGs
+ OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
+ new_pool_inc.new_pool_max = osdmap.get_pool_max();
+ new_pool_inc.fsid = osdmap.get_fsid();
+ pg_pool_t empty;
+ uint64_t upmap_primary_pool_id = ++new_pool_inc.new_pool_max;
+ pg_pool_t *p = new_pool_inc.get_new_pool(upmap_primary_pool_id, &empty);
+ p->size = 3;
+ p->set_pg_num(64);
+ p->set_pgp_num(64);
+ p->type = pg_pool_t::TYPE_REPLICATED;
+ p->crush_rule = upmap_primary_rule_no;
+ p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+ new_pool_inc.new_pool_names[upmap_primary_pool_id] = "upmap_primary_pool";
+ osdmap.apply_incremental(new_pool_inc);
+
+ {
+ // Sanity check - we shouldn't have any pg_upmap_primary entries yet
+ ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 0);
+
+ // Create valid pg_upmap_primary mappings on the "upmap_primary_pool"
+ OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+ OSDMap tmp_osd_map;
+ tmp_osd_map.deepish_copy_from(osdmap);
+ int num_changes = osdmap.balance_primaries(g_ceph_context, upmap_primary_pool_id,
+ &pending_inc, tmp_osd_map);
+ osdmap.apply_incremental(pending_inc);
+
+ // Check for mappings; `balance_primaries` should have created 10 pg_upmap_primary
+ // mappings for this configuration
+ ASSERT_EQ(num_changes, 10);
+ ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 10);
+ }
+
+ // -------TEST 1: Reduce number of PGs --------------
+ // For the sake of the unit test, we will simulate reducing the pg_num by removing
+ // the current pool and swapping it out with a new pool that has all the same info,
+ // except a decreased pg_num
+ {
+ // Remove current pool
+ OSDMap::Incremental remove_pool_inc(osdmap.get_epoch() + 1);
+ remove_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("upmap_primary_pool"));
+ osdmap.apply_incremental(remove_pool_inc);
+
+ // Add same pool back in with reduced pg_num (64 --> 16)
+ OSDMap::Incremental reduced_pool_inc(osdmap.get_epoch() + 1);
+ pg_pool_t new_empty;
+ pg_pool_t *reduced_pool = reduced_pool_inc.get_new_pool(upmap_primary_pool_id, &new_empty);
+ reduced_pool->size = 3;
+ reduced_pool->set_pg_num(16);
+ reduced_pool->set_pgp_num(16);
+ reduced_pool->type = pg_pool_t::TYPE_REPLICATED;
+ reduced_pool->crush_rule = upmap_primary_rule_no;
+ reduced_pool->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+ reduced_pool_inc.new_pool_names[upmap_primary_pool_id] = "upmap_primary_pool";
+ osdmap.apply_incremental(reduced_pool_inc);
+ }
+ {
+ // Clean invalid pg_upmap_primary entries
+ OSDMap::Incremental cleanup_inc(osdmap.get_epoch() + 1);
+ clean_pg_upmaps(g_ceph_context, osdmap, cleanup_inc);
+ osdmap.apply_incremental(cleanup_inc);
+
+ // Ensure that all pg_upmap_primary mappings have been removed
+ ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 0);
+ }
+
+ // -------TEST 2: Create Invalid Mapping --------------
+ {
+ // Create 10 valid pg_upmap_primary mappings on the "upmap_primary_pool".
+ // Valid mappings are where a new primary is set to an OSD that already exists
+ // in the PG up set but which is not the current primary OSD.
+ OSDMap::Incremental valid_mappings_inc(osdmap.get_epoch() + 1);
+ std::cout << osdmap << std::endl;
+ for (int pg_num = 0; pg_num < 10; ++pg_num) {
+ pg_t rawpg(pg_num, upmap_primary_pool_id);
+ pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
+ vector<int> raw, up;
+ osdmap.pg_to_raw_upmap(rawpg, &raw, &up);
+ int new_primary = raw[1]; // pick the second OSD to be the new prim (this is valid!)
+ ASSERT_NE(raw[0], new_primary);
+ valid_mappings_inc.new_pg_upmap_primary[pgid] = new_primary;
+ }
+ osdmap.apply_incremental(valid_mappings_inc);
+
+ // Check for 10 pg_upmap_primary mappings.
+ ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 10);
+ }
+ {
+ // Make an illegal mapping on the next PG by setting the primary
+ // to an OSD that exists outside the up set.
+ OSDMap::Incremental illegal_mapping_inc(osdmap.get_epoch() + 1);
+ pg_t rawpg(10, upmap_primary_pool_id);
+ pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
+ vector<int> raw, up;
+ osdmap.pg_to_raw_upmap(rawpg, &raw, &up);
+ int new_primary = 2;
+ for (int osd : raw) {
+ ASSERT_NE(osd, new_primary);
+ }
+ illegal_mapping_inc.new_pg_upmap_primary[pgid] = new_primary;
+ osdmap.apply_incremental(illegal_mapping_inc);
+ }
+ {
+ // Check for 11 mappings (10 legal, 1 illegal)
+ ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 11);
+
+ // Clean invalid pg_upmap_primary entry
+ OSDMap::Incremental cleanup_inc(osdmap.get_epoch() + 1);
+ clean_pg_upmaps(g_ceph_context, osdmap, cleanup_inc);
+ osdmap.apply_incremental(cleanup_inc);
+
+ // Ensure that the illegal mapping was removed
+ ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 10);
+ }
+
+ // -------TEST 3: Mark an OSD Out --------------
+ {
+ // mark one of the OSDs out
+ OSDMap::Incremental out_osd_inc(osdmap.get_epoch() + 1);
+ out_osd_inc.new_weight[0] = CEPH_OSD_OUT;
+ osdmap.apply_incremental(out_osd_inc);
+ ASSERT_TRUE(osdmap.is_out(0));
+ std::cout << osdmap << std::endl;
+ }
+ {
+ // Clean mappings for out OSDs
+ OSDMap::Incremental cleanup_inc(osdmap.get_epoch() + 1);
+ clean_pg_upmaps(g_ceph_context, osdmap, cleanup_inc);
+ osdmap.apply_incremental(cleanup_inc);
+
+ // Ensure that mappings with out OSDs were removed
+ ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 7);
+ }
+
+ // -------TEST 4: Create redundant mapping -----
+ {
+ // Make a redundant mapping on the next PG by setting the primary
+ // to an OSD that is already primary.
+ OSDMap::Incremental redundant_mapping_inc(osdmap.get_epoch() + 1);
+ pg_t rawpg(10, upmap_primary_pool_id);
+ pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
+ vector<int> raw, up;
+ osdmap.pg_to_raw_upmap(rawpg, &raw, &up);
+ int new_primary = 3;
+ ASSERT_EQ(raw[0], new_primary);
+ redundant_mapping_inc.new_pg_upmap_primary[pgid] = new_primary;
+ osdmap.apply_incremental(redundant_mapping_inc);
+ }
+ {
+ // Check for 8 mappings (7 legal, 1 redundant)
+ ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 8);
+
+ // Clean invalid pg_upmap_primary entry
+ OSDMap::Incremental cleanup_inc(osdmap.get_epoch() + 1);
+ clean_pg_upmaps(g_ceph_context, osdmap, cleanup_inc);
+ osdmap.apply_incremental(cleanup_inc);
+
+ // Ensure that the redundant mapping was removed
+ ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 7);
+ }
+
+ // -------TEST 5: Delete the pool --------------
+ {
+ // Delete "upmap_primary_pool"
+ OSDMap::Incremental delete_pool_inc(osdmap.get_epoch() + 1);
+ delete_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("upmap_primary_pool"));
+ osdmap.apply_incremental(delete_pool_inc);
+ }
+ {
+ // Verify that mappings (now invalid) still exist before cleanup
+ ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 7);
+
+ // Clean invalid pg_upmap_primary entries
+ OSDMap::Incremental cleanup_inc(osdmap.get_epoch() + 1);
+ clean_pg_upmaps(g_ceph_context, osdmap, cleanup_inc);
+ osdmap.apply_incremental(cleanup_inc);
+
+ // Ensure that all pg_upmap_primary mappings have been removed
+ ASSERT_EQ(osdmap.get_num_pg_upmap_primaries(), 0);
+ }
+}
+
TEST_F(OSDMapTest, BUG_38897) {
// http://tracker.ceph.com/issues/38897
// build a fresh map with 12 OSDs, without any default pools
}
ss << std::endl;
}
+ for (auto& i : pending_inc.old_pg_upmap_primary) {
+ if (vstart)
+ ss << prefix;
+ ss << cmd + " osd rm-pg-upmap-primary " << i << std::endl;
+ }
for (auto& i : pending_inc.new_pg_upmap_primary) {
if (vstart)
ss << prefix;