op->put();
spg_t pgid;
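+ // MOSDOps from legacy (pre-RESEND_ON_SPLIT) clients do not carry a
+ // usable spg_t, so map the raw pg to the primary shard here; all
+ // other messages trust the spg_t they carry.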
- if (m->get_type() == CEPH_MSG_OSD_OP) {
+ if (m->get_type() == CEPH_MSG_OSD_OP &&
+ !m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT)) {
pg_t actual_pgid = osdmap->raw_pg_to_pg(
static_cast<const MOSDOp*>(m)->get_pg());
if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
continue;
}
} else {
pgid = m->get_spg();
+ // Pre-tentacle clients encode the shard_id_t incorrectly for optimized
+ // EC pools with pg_temp set. Correct the mistake here.
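+ // (E.g., assuming a 4+2 profile: pg_temp lists shard 0, then parity
+ // shards 4 and 5, then the non-primary data shards, so the true
+ // shard 1 appears at position 3 and a pre-tentacle client sends
+ // shard 3 for it; pgtemp_undo_primaryfirst() maps it back to 1.)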
+ if (!m->get_connection()->has_features(CEPH_FEATUREMASK_SERVER_TENTACLE)) {
+ auto pi = osdmap->get_pg_pool(pgid.pool());
+ pgid.reset_shard(osdmap->pgtemp_undo_primaryfirst(*pi, pgid.pgid, pgid.shard));
+ }
}
enqueue_op(pgid, std::move(op), m->get_map_epoch());
}
service.maybe_inject_dispatch_delay();
- if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
- m->get_type() != CEPH_MSG_OSD_OP) {
+ // Pre-tentacle clients sending requests to EC shards other than 0 may
+ // set the shard incorrectly, because pg_temp for optimized EC pools
+ // lists primary-capable shards first. These requests need to be routed
+ // through dispatch_session_waiting, which uses the OSDMap to correct
+ // the shard.
+ bool legacy = !m->get_connection()->has_features(CEPH_FEATUREMASK_SERVER_TENTACLE);
+ spg_t spg = static_cast<MOSDFastDispatchOp*>(m)->get_spg();
+ if (legacy) {
+ // Optimization: replicated pools (NO_SHARD) and EC shard 0 are never
+ // remapped by the primary-first encoding
+ if ((spg.shard == shard_id_t::NO_SHARD) ||
+ (spg.shard == shard_id_t(0))) {
+ legacy = false;
+ }
+ }
+ if (!legacy &&
+ (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
+ m->get_type() != CEPH_MSG_OSD_OP)) {
// queue it directly
enqueue_op(
- static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
+ spg,
std::move(op),
static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
} else {
return acting;
}
+const shard_id_t OSDMap::pgtemp_primaryfirst(const pg_pool_t& pool,
+ const pg_t pg, const shard_id_t shard) const
+{
+ if ((shard == shard_id_t::NO_SHARD) ||
+ (shard == shard_id_t(0))) {
+ return shard;
+ }
+ shard_id_t result = shard;
+ if (pool.allows_ecoptimizations()) {
+ if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+ int num_parity_shards = pool.size - pool.nonprimary_shards.size() - 1;
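+ // pg_temp for optimized EC pools lists the acting set in "primary
+ // first" order: shard 0, then the (primary-capable) parity shards,
+ // then the non-primary data shards. Map the true shard to its
+ // position in that order. E.g., assuming a 4+2 profile (shards 0-3
+ // data, 4-5 parity), the order is [0,4,5,1,2,3], so shard 4 maps
+ // to 1 and shard 1 maps to 3.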
+ if (shard >= pool.size - num_parity_shards) {
+ result = shard_id_t(result + num_parity_shards + 1 - pool.size);
+ } else {
+ result = shard_id_t(result + num_parity_shards);
+ }
+ }
+ }
+ return result;
+}
+
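+ // Inverse of pgtemp_primaryfirst(): for any shard s,
+ // pgtemp_undo_primaryfirst(pool, pg, pgtemp_primaryfirst(pool, pg, s))
+ // == s. The OSDMap unit tests exercise this round trip.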
+shard_id_t OSDMap::pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+ const pg_t pg, const shard_id_t shard) const
+{
+ if ((shard == shard_id_t::NO_SHARD) ||
+ (shard == shard_id_t(0))) {
+ return shard;
+ }
+ shard_id_t result = shard;
+ if (pool.allows_ecoptimizations()) {
+ if (pg_temp->find(pool.raw_pg_to_pg(pg)) != pg_temp->end()) {
+ int num_parity_shards = pool.size - pool.nonprimary_shards.size() - 1;
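+ // Invert the primary-first mapping: positions 1..num_parity_shards
+ // hold the parity shards, later positions hold the non-primary data
+ // shards. Continuing the 4+2 example, position 1 maps back to
+ // shard 4 and position 3 maps back to shard 1.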
+ if (shard > num_parity_shards) {
+ result = shard_id_t(result - num_parity_shards);
+ } else {
+ result = shard_id_t(result + pool.size - num_parity_shards - 1);
+ }
+ }
+ }
+ return result;
+}
+
void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
vector<int> *temp_pg, int *temp_primary) const
{
return false;
}
bool get_primary_shard(const pg_t& pgid, int *primary, spg_t *out) const {
- auto i = get_pools().find(pgid.pool());
- if (i == get_pools().end()) {
+ auto poolit = get_pools().find(pgid.pool());
+ if (poolit == get_pools().end()) {
return false;
}
std::vector<int> acting;
pg_to_acting_osds(pgid, &acting, primary);
- if (i->second.is_erasure()) {
+ if (poolit->second.is_erasure()) {
for (uint8_t i = 0; i < acting.size(); ++i) {
if (acting[i] == *primary) {
- *out = spg_t(pgid, shard_id_t(i));
+ *out = spg_t(pgid, pgtemp_undo_primaryfirst(poolit->second, pgid, shard_id_t(i)));
return true;
}
}
const std::vector<int> pgtemp_undo_primaryfirst(const pg_pool_t& pool,
const pg_t pg,
const std::vector<int>& acting) const;
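+ // Map a single shard to/from the "primary first" order that pg_temp
+ // uses for optimized EC pools (shard 0 first, then parity shards,
+ // then non-primary data shards).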
+ const shard_id_t pgtemp_primaryfirst(const pg_pool_t& pool,
+ const pg_t pg,
+ const shard_id_t shard) const;
+ shard_id_t pgtemp_undo_primaryfirst(const pg_pool_t& pool,
+ const pg_t pg,
+ const shard_id_t shard) const;
bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const {
auto p = removed_snaps_queue.find(pool);
if (pi->is_erasure()) {
for (uint8_t i = 0; i < t->acting.size(); ++i) {
if (t->acting[i] == acting_primary) {
- spgid.reset_shard(shard_id_t(i));
+ spgid.reset_shard(osdmap->pgtemp_undo_primaryfirst(*pi, actual_pgid, shard_id_t(i)));
break;
}
}
decoded = osdmap.pgtemp_undo_primaryfirst(pool, pgid, encoded);
ASSERT_EQ(set, decoded);
- // With nonprimary_shards
- for (int seed = 1; seed < 64; seed++) {
+ // With nonprimary_shards set. Shard 0 is never a non-primary shard,
+ // so skip odd seeds (bit 0 would mark shard 0 as non-primary).
+ for (int seed = 2; seed < 64; seed += 2) {
for (int osd = 0; osd < 6; osd++ ) {
if (seed & (1 << osd)) {
pool.nonprimary_shards.insert(shard_id_t(osd));
// non-primary shards last
ASSERT_TRUE(pool.is_nonprimary_shard(shard_id_t(encoded[osd])));
}
+ std::cout << osd << " " << seed << " " << osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd)) << std::endl;
+ // Decode must invert encode: the round trip returns the original shard
+ ASSERT_EQ(osdmap.pgtemp_undo_primaryfirst(pool, pgid,
+ osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd))),
+ shard_id_t(osd));
+ // Shard 0 never changes. Seed 62 is a special case: shards 1-5 are all
+ // non-primary, so there are no parity shards and the encoding is the
+ // identity.
+ if ((osd != 0) && (seed != 62)) {
+ // Encode should be different
+ ASSERT_NE(osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd)),
+ shard_id_t(osd));
+ } else {
+ // Encode should not change
+ ASSERT_EQ(osdmap.pgtemp_primaryfirst(pool, pgid, shard_id_t(osd)),
+ shard_id_t(osd));
+ }
}
decoded = osdmap.pgtemp_undo_primaryfirst(pool, pgid, encoded);
ASSERT_EQ(set, decoded);