<< ": pool has been removed" << dendl;
continue;
}
+ // Pools with allow_ec_optimizations set store pg_temp in a different
+ // order to change the primary selection algorithm without breaking
+ // old clients. If necessary, re-order the new pg_temp now
+ pg_pool_t pg_pool;
+ if (pending_inc.new_pools.count(pool))
+ pg_pool = pending_inc.new_pools[pool];
+ else
+ pg_pool = *osdmap.get_pg_pool(pool);
+
+ std::vector<int> pg_temp = osdmap.pgtemp_primaryfirst(pg_pool, p->second);
pending_inc.new_pg_temp[p->first] =
- mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
+ mempool::osdmap::vector<int>(pg_temp.begin(), pg_temp.end());
// unconditionally clear pg_primary (until this message can encode
// a change for that, too.. at which point we need to also fix
}
}
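The reorder is safe for old clients because the unchanged primary-selection path simply scans pg_temp and takes the first mapped entry. A minimal sketch of that invariant, assuming a plain int vector with -1 standing in for CRUSH_ITEM_NONE (illustrative only, not the actual OSDMap code):

```cpp
#include <vector>

constexpr int NONE = -1;  // stand-in for CRUSH_ITEM_NONE

// Simplified model of the existing primary selection: the first mapped OSD
// in pg_temp wins. With primary-capable shards moved to the front by
// pgtemp_primaryfirst(), this unchanged logic picks a permitted primary.
int pick_primary(const std::vector<int>& pg_temp) {
  for (int osd : pg_temp) {
    if (osd != NONE)
      return osd;
  }
  return NONE;  // pg is incomplete; the choice of primary doesn't matter
}
```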
+/* EC pools with allow_ec_optimizations set have some shards that cannot
+ * become the primary because they are not updated on every I/O. To avoid
+ * requiring clients to be upgraded to use these new pools, the logic in
+ * OSDMap which selects a primary cannot be changed. Instead, choose_acting
+ * is modified to set pg_temp when it is necessary to override the choice
+ * of primary, and this vector is reordered so that shards that are
+ * permitted to be the primary are listed first. The existing OSDMap code
+ * will then choose a suitable shard as primary, except when the pg is
+ * incomplete, in which case the choice of primary doesn't matter. This
+ * function is called by OSDMonitor when setting pg_temp to transform the
+ * vector.
+ *
+ * Example: Optimized EC pool 4+2
+ * acting_set = {NONE, 6, 7, 8, 9, 10}
+ * non_primary_shards = {1, 2, 3} # data shards other than shard 0
+ * pg_temp = {NONE, 9, 10, 6, 7, 8} # non-primary shards at end
+ * primary will be OSD 9(1)
+ */
+const std::vector<int> OSDMap::pgtemp_primaryfirst(const pg_pool_t& pool,
+ const std::vector<int>& pg_temp) const
+{
+ // Only perform the transform for pools with allow_ec_optimizations set
+ if (pool.allows_ecoptimizations()) {
+ std::vector<int> result;
+ std::vector<int> nonprimary;
+ int shard = 0;
+ for (auto osd : pg_temp) {
+ if (pool.is_nonprimary_shard(shard_id_t(shard))) {
+ nonprimary.emplace_back(osd);
+ } else {
+ result.emplace_back(osd);
+ }
+ shard++;
+ }
+ result.insert(result.end(), nonprimary.begin(), nonprimary.end());
+ return result;
+ }
+ return pg_temp;
+}
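A self-contained sketch reproducing the 4+2 example from the comment above. It assumes simplified types: a std::set<int> in place of pg_pool_t's nonprimary_shards, plain int shard ids instead of shard_id_t, and -1 for CRUSH_ITEM_NONE:

```cpp
#include <cassert>
#include <set>
#include <vector>

constexpr int NONE = -1;  // stand-in for CRUSH_ITEM_NONE

// Mirrors pgtemp_primaryfirst(): shards that may be primary keep their
// relative order at the front; non-primary shards are appended at the end.
std::vector<int> primaryfirst(const std::set<int>& nonprimary_shards,
                              const std::vector<int>& pg_temp) {
  std::vector<int> result, nonprimary;
  int shard = 0;
  for (int osd : pg_temp) {
    if (nonprimary_shards.count(shard))
      nonprimary.push_back(osd);
    else
      result.push_back(osd);
    shard++;
  }
  result.insert(result.end(), nonprimary.begin(), nonprimary.end());
  return result;
}

int main() {
  std::set<int> nonprimary_shards{1, 2, 3};       // data shards other than 0
  std::vector<int> acting{NONE, 6, 7, 8, 9, 10};  // indexed by shard
  assert((primaryfirst(nonprimary_shards, acting) ==
          std::vector<int>{NONE, 9, 10, 6, 7, 8}));
}
```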
+
+/* The function above reorders the pg_temp vector. That transformation must
+ * be reversed by OSDs (but not by clients); this function is called by
+ * PeeringState when initializing the acting set.
+ */
+const std::vector<int> OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+ const pg_t pg, const std::vector<int>& acting) const
+{
+ // Only perform the transform for pools with allow_ec_optimizations set
+ // that also have pg_temp set
+ if (pool.allows_ecoptimizations()) {
+ if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+ std::vector<int> result;
+ int primaryshard = 0;
+ int nonprimaryshard = pool.size - pool.nonprimary_shards.size();
+ ceph_assert(acting.size() == pool.size);
+ for (auto shard = 0; shard < pool.size; shard++) {
+ if (pool.is_nonprimary_shard(shard_id_t(shard))) {
+ result.emplace_back(acting[nonprimaryshard++]);
+ } else {
+ result.emplace_back(acting[primaryshard++]);
+ }
+ }
+ return result;
+ }
+ }
+ return acting;
+}
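Under the same simplified assumptions, the undo transform can be verified as a round trip: primary-capable entries are consumed from the front of the reordered vector and non-primary entries from the tail, rebuilding the shard-indexed order:

```cpp
#include <cassert>
#include <set>
#include <vector>

// Mirrors pgtemp_undo_primaryfirst(): the entry for shard s comes from the
// front of 'acting' if s may be primary, otherwise from the non-primary tail.
std::vector<int> undo_primaryfirst(const std::set<int>& nonprimary_shards,
                                   size_t pool_size,
                                   const std::vector<int>& acting) {
  assert(acting.size() == pool_size);
  std::vector<int> result;
  size_t primaryshard = 0;
  size_t nonprimaryshard = pool_size - nonprimary_shards.size();
  for (size_t shard = 0; shard < pool_size; shard++) {
    if (nonprimary_shards.count(shard))
      result.push_back(acting[nonprimaryshard++]);
    else
      result.push_back(acting[primaryshard++]);
  }
  return result;
}

int main() {
  std::set<int> nonprimary_shards{1, 2, 3};
  std::vector<int> reordered{-1, 9, 10, 6, 7, 8};  // output of the sketch above
  assert((undo_primaryfirst(nonprimary_shards, 6, reordered) ==
          std::vector<int>{-1, 6, 7, 8, 9, 10}));   // shard order restored
}
```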
+
void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
vector<int> *temp_pg, int *temp_primary) const
{
mempool::osdmap::vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
mempool::osdmap::vector<osd_info_t> osd_info;
+ // Optimized EC pools re-order pg_temp, see pgtemp_primaryfirst
std::shared_ptr<PGTempMap> pg_temp; // temp pg mapping (e.g. while we rebuild)
std::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp; // temp primary mapping (e.g. while we rebuild)
std::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline
return false;
}
+ const std::vector<int> pgtemp_primaryfirst(const pg_pool_t& pool,
+ const std::vector<int>& pg_temp) const;
+ const std::vector<int> pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+ const pg_t pg,
+ const std::vector<int>& acting) const;
+
bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const {
auto p = removed_snaps_queue.find(pool);
if (p == removed_snaps_queue.end()) {
int newupprimary,
int newactingprimary,
const vector<int>& newup,
- const vector<int>& newacting,
+ const vector<int>& _newacting,
OSDMapRef lastmap,
OSDMapRef osdmap)
{
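+  // acting may reflect a pg_temp stored in primary-first order; restore
+  // shard order before checking for a new interval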
+ const vector<int> newacting = osdmap->pgtemp_undo_primaryfirst(
+ pool.info,
+ info.pgid.pgid,
+ _newacting);
if (PastIntervals::is_new_interval(
primary.osd,
newactingprimary,
int new_acting_primary)
{
actingset.clear();
- acting = newacting;
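+  // undo the primary-first reordering applied when pg_temp was set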
+ acting = get_osdmap()->pgtemp_undo_primaryfirst(pool.info,
+ info.pgid.pgid,
+ newacting);
for (uint8_t i = 0; i < acting.size(); ++i) {
if (acting[i] != CRUSH_ITEM_NONE)
actingset.insert(
<< " from oversized want " << want << dendl;
want.pop_back();
}
- if (want != acting) {
- psdout(10) << "want " << want << " != acting " << acting
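+  // an OSD whose own shard is a non-primary shard cannot remain primary,
+  // so request a pg_temp change even when want == acting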
+ if ((want != acting) ||
+ pool.info.is_nonprimary_shard(pg_whoami.shard)) {
+ if (pool.info.is_nonprimary_shard(pg_whoami.shard)) {
+ psdout(10) << "shard " << pg_whoami.shard << " cannot be primary, want "
+ << pg_vector_string(want)
+ << " acting " << pg_vector_string(acting)
<< ", requesting pg_temp change" << dendl;
+ } else {
+ psdout(10) << "want " << pg_vector_string(want)
+ << " != acting " << pg_vector_string(acting)
+ << ", requesting pg_temp change" << dendl;
+ }
want_acting = want;
if (!cct->_conf->osd_debug_no_acting_change) {
- if (want_acting == up) {
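+  // a non-primary shard still needs a real pg_temp even when want == up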
+ if ((want_acting == up) &&
+ !pool.info.is_nonprimary_shard(pg_whoami.shard)) {
// There can't be any pending backfill if
// want is the same as crush map up OSDs.
ceph_assert(want_backfill.empty());
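To summarize the hunk above, a hypothetical distillation of the two changed conditions (names are illustrative stand-ins, not the PeeringState API; local_shard_is_nonprimary corresponds to pool.info.is_nonprimary_shard(pg_whoami.shard)):

```cpp
#include <vector>

// A pg_temp change is requested when the wanted set differs from acting, or
// unconditionally when this OSD holds a shard that must not act as primary.
bool needs_pg_temp_change(const std::vector<int>& want,
                          const std::vector<int>& acting,
                          bool local_shard_is_nonprimary) {
  return want != acting || local_shard_is_nonprimary;
}

// The want == up shortcut (no pending backfill, pg_temp can be dropped) only
// applies when this shard is allowed to be primary; a non-primary shard
// still needs a real pg_temp installed.
bool can_take_up_shortcut(const std::vector<int>& want,
                          const std::vector<int>& up,
                          bool local_shard_is_nonprimary) {
  return want == up && !local_shard_is_nonprimary;
}
```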