This is a collection of minor changes to make scrub compatible with the new (optimized) EC implementation.
Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
op.recovery_info.oi = op.obc->obs.oi;
}
- if (sinfo.require_hinfo()) {
+ if (sinfo.get_is_hinfo_required()) {
ECUtil::HashInfo hinfo(sinfo.get_k_plus_m());
if (op.obc->obs.oi.size > 0) {
ceph_assert(op.xattrs.count(ECUtil::get_hinfo_key()));
if (op.recovery_progress.first && op.obc) {
op.xattrs = op.obc->attr_cache;
- if (sinfo.require_hinfo()) {
+ if (sinfo.get_is_hinfo_required()) {
if (auto [r, attrs, size] = ecbackend->get_attrs_n_size_from_disk(
op.hoid);
r >= 0 || r == -ENOENT) {
o.read_error = true;
return 0;
}
- if (bl.length() % sinfo.get_chunk_size()) {
- dout(20) << __func__ << " " << poid << " got "
- << r << " on read, not chunk size " << sinfo.get_chunk_size() << " aligned"
- << dendl;
- o.read_error = true;
- return 0;
- }
if (r > 0) {
pos.data_hash << bl;
}
return -EINPROGRESS;
}
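+ /* Hinfo is not required, so partial overwrites must be in use. Partial
+  * overwrites do not support deep-scrub of the data yet, so simply report
+  * a fixed digest.
+  */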
+ if (!sinfo.get_is_hinfo_required()) {
+ o.digest = 0;
+ o.digest_present = true;
+ o.omap_digest = -1;
+ o.omap_digest_present = true;
+ return 0;
+ }
+
ECUtil::HashInfoRef hinfo = unstable_hashinfo_registry.get_hash_info(
poid, false, o.attrs, o.size);
if (!hinfo) {
o.read_error = true;
o.digest_present = false;
return 0;
- } else {
- if (!sinfo.supports_ec_overwrites()) {
- if (!hinfo->has_chunk_hash()) {
- dout(0) << "_scan_list " << poid << " got invalid hash info" << dendl;
- o.ec_size_mismatch = true;
- return 0;
- }
- if (hinfo->get_total_chunk_size() != (unsigned)pos.data_pos) {
- dout(0) << "_scan_list " << poid << " got incorrect size on read 0x"
- << std::hex << pos
- << " expected 0x" << hinfo->get_total_chunk_size() << std::dec
- << dendl;
- o.ec_size_mismatch = true;
- return 0;
- }
-
- if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) !=
- pos.data_hash.digest()) {
- dout(0) << "_scan_list " << poid << " got incorrect hash on read 0x"
- << std::hex << pos.data_hash.digest() << " != expected 0x"
- << hinfo->get_chunk_hash(get_parent()->whoami_shard().shard)
- << std::dec << dendl;
- o.ec_hash_mismatch = true;
- return 0;
- }
+ }
+ if (!hinfo->has_chunk_hash()) {
+ dout(0) << "_scan_list " << poid << " got invalid hash info" << dendl;
+ o.ec_size_mismatch = true;
+ return 0;
+ }
+ if (hinfo->get_total_chunk_size() != (unsigned)pos.data_pos) {
+ dout(0) << "_scan_list " << poid << " got incorrect size on read 0x"
+ << std::hex << pos
+ << " expected 0x" << hinfo->get_total_chunk_size() << std::dec
+ << dendl;
+ o.ec_size_mismatch = true;
+ return 0;
+ }
- /* We checked above that we match our own stored hash. We cannot
- * send a hash of the actual object, so instead we simply send
- * our locally stored hash of shard 0 on the assumption that if
- * we match our chunk hash and our recollection of the hash for
- * chunk 0 matches that of our peers, there is likely no corruption.
- */
- o.digest = hinfo->get_chunk_hash(shard_id_t(0));
- o.digest_present = true;
- } else {
- /* Hack! We must be using partial overwrites, and partial overwrites
- * don't support deep-scrub yet
- */
- o.digest = 0;
- o.digest_present = true;
- }
+ if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) !=
+ pos.data_hash.digest()) {
+ dout(0) << "_scan_list " << poid << " got incorrect hash on read 0x"
+ << std::hex << pos.data_hash.digest() << " != expected 0x"
+ << hinfo->get_chunk_hash(get_parent()->whoami_shard().shard)
+ << std::dec << dendl;
+ o.ec_hash_mismatch = true;
+ return 0;
}
+ /* We checked above that we match our own stored hash. We cannot
+ * send a hash of the actual object, so instead we simply send
+ * our locally stored hash of shard 0 on the assumption that if
+ * we match our chunk hash and our recollection of the hash for
+ * chunk 0 matches that of our peers, there is likely no corruption.
+ */
+ o.digest = hinfo->get_chunk_hash(shard_id_t(0));
+ o.digest_present = true;
o.omap_digest = -1;
o.omap_digest_present = true;
return 0;
return sinfo.object_size_to_shard_size(size, shard);
}
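+ // A non-primary shard can never become primary and may hold an older
+ // version of the object.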
+ bool get_is_nonprimary_shard(shard_id_t shard) const {
+ return sinfo.is_nonprimary_shard(shard);
+ }
+
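+ // True when EC hinfo is required (i.e. overwrites are not supported).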
+ bool get_is_hinfo_required() const {
+ return sinfo.get_is_hinfo_required();
+ }
+
/**
* ECReadPred
*
return legacy.object_size_to_shard_size(size);
// All shards are the same size.
}
+ bool get_is_nonprimary_shard(shard_id_t shard) const final {
+ if (is_optimized()) {
+ return optimized.get_is_nonprimary_shard(shard);
+ }
+ return false;
+ }
+ bool get_is_hinfo_required() const final {
+ if (is_optimized()) {
+ return optimized.get_is_hinfo_required();
+ }
+ return true;
+ }
+ bool get_is_ec_optimized() const final {
+ return is_optimized();
+ }
};
ErasureCodeInterface::FLAG_EC_PLUGIN_REQUIRE_SUB_CHUNKS) != 0;
}
- bool require_hinfo() const {
+ bool get_is_hinfo_required() const {
return !supports_ec_overwrites();
}
return !waiting_for_unreadable_object.empty();
}
+ bool get_is_nonprimary_shard(const pg_shard_t &shard) const final
+ {
+ return get_pgbackend()->get_is_nonprimary_shard(shard.shard);
+ }
+
+ bool get_is_hinfo_required() const final
+ {
+ return get_pgbackend()->get_is_hinfo_required();
+ }
+
+ bool get_is_ec_optimized() const final {
+ return get_pgbackend()->get_is_ec_optimized();
+ }
+
static void set_last_scrub_stamp(
utime_t t, pg_history_t &history, pg_stat_t &stats) {
stats.last_scrub_stamp = t;
}
uint64_t logical_to_ondisk_size(uint64_t logical_size,
- int8_t shard_id) const final {
+ shard_id_t shard_id) const final {
return get_pgbackend()->be_get_ondisk_size(logical_size, shard_id_t(shard_id));
}
};
virtual int get_ec_stripe_chunk_size() const { return 0; };
virtual uint64_t object_size_to_shard_size(const uint64_t size, shard_id_t shard) const { return size; };
virtual void dump_recovery_info(ceph::Formatter *f) const = 0;
+ virtual bool get_is_nonprimary_shard(shard_id_t shard) const {
+ return false; // Only EC has nonprimary shards.
+ };
+ virtual bool get_is_hinfo_required() const {
+ return false; // Only EC can have hinfo.
+ }
+ virtual bool get_is_ec_optimized() const {
+ return false; // Only EC can be EC optimized.
+ }
private:
std::set<hobject_t> temp_contents;
<< " " << oi.alloc_hint_flags << "]";
if (oi.has_manifest())
out << " " << oi.manifest;
+ if (!oi.shard_versions.empty())
+ out << " shard_versions=" << oi.shard_versions;
out << ")";
return out;
}
clear_omap_digest();
}
+ eversion_t get_version_for_shard(shard_id_t shard) const {
+ auto iter = shard_versions.find(shard);
+
+ // If the shard is not present in shard_versions, its version is the same
+ // as this object's version.
+ if (iter == shard_versions.end()) {
+ return version;
+ }
+ // Otherwise, shard_versions is fully populated, so return the version
+ // recorded for this shard.
+ return iter->second;
+ }
+
void encode(ceph::buffer::list& bl, uint64_t features) const;
void decode(ceph::buffer::list::const_iterator& bl);
void decode(const ceph::buffer::list& bl) {
}
uint64_t ScrubBackend::logical_to_ondisk_size(uint64_t logical_size,
- int8_t shard_id) const
+ shard_id_t shard_id) const
{
return m_pg.logical_to_ondisk_size(logical_size, shard_id);
}
// ok, do the pg-type specific scrubbing
// (Validates consistency of the object info and snap sets)
- scrub_snapshot_metadata(for_meta_scrub);
+ scrub_snapshot_metadata(for_meta_scrub, m_pg_whoami);
return objs_fix_list_t{std::move(this_chunk->m_inconsistent_objs),
scan_snaps(for_meta_scrub, snaps_getter)};
// do not emit the returned error message to the log
dout(15) << fmt::format("{}: {} not found on shard {}", __func__, ho, l)
<< dendl;
+ } else if (shard_ret.possible_auth == shard_as_auth_t::usable_t::not_usable_no_err) {
+ dout(20) << fmt::format("{}: skipping not_usable_no_err {} {} {}",
+ __func__,
+ l,
+ shard_ret.oi.version,
+ shard_ret.oi.soid)
+ << dendl;
} else {
dout(30) << fmt::format("{}: consider using {} srv: {} oi soid: {}",
// We won't pick an auth copy if the snapset is missing or won't decode.
ceph_assert(!obj.is_snapdir());
- if (obj.is_head()) {
+ if (obj.is_head() && !m_pg.get_is_nonprimary_shard(j_shard)) {
auto k = smap_obj.attrs.find(SS_ATTR);
if (dup_error_cond(err,
false,
}
}
- if (!m_is_replicated) {
+ if (m_pg.get_is_hinfo_required()) {
auto k = smap_obj.attrs.find(ECUtil::get_hinfo_key());
if (dup_error_cond(err,
false,
}
}
- uint64_t ondisk_size = logical_to_ondisk_size(oi.size, srd.shard.id);
+ uint64_t ondisk_size = logical_to_ondisk_size(oi.size, srd.shard);
if (test_error_cond(smap_obj.size != ondisk_size, shard_info,
&shard_info_wrapper::set_obj_size_info_mismatch)) {
ceph_assert(!err);
// note that the error text is made available to the caller, even
- // for a successful shard selection
- return shard_as_auth_t{oi, j, errstream.str(), digest};
+ // for a successful shard selection.
+ // Non-primary shards cannot be used as authoritative, but this is not
+ // considered a failure.
+ return shard_as_auth_t{oi, j, errstream.str(), digest,
+ m_pg.get_is_nonprimary_shard(j_shard)};
}
// re-implementation of PGBackend::be_compare_scrubmaps()
if (!can_bl.contents_equal(auth_bl)) {
object_info_t oi(can_bl);
fmt::format_to(std::back_inserter(out),
- "{}object info inconsistent auth_io={} candidate_oi={}",
+ "{}object info inconsistent auth_oi={} candidate_oi={}",
sep(error), auth_oi, oi);
obj_result.set_object_info_inconsistency();
}
if (oi.version != auth_oi.get_version_for_shard(shard.shard) ||
oi.size != auth_oi.size) {
fmt::format_to(std::back_inserter(out),
- "{}object info version incorrect auth_io={} candidate_oi={}",
+ "{}object info version incorrect auth_oi={} candidate_oi={}",
sep(error), auth_oi, oi);
obj_result.set_object_info_inconsistency();
}
}
}
- if (has_snapset) {
+ if (has_snapset && !m_pg.get_is_nonprimary_shard(shard)) {
if (!shard_result.has_snapset_missing() &&
!shard_result.has_snapset_corrupted()) {
// ------------------------------------------------------------------------
- if (!m_is_replicated) {
+ // Only EC can have hinfo, so the if statement below is only entered for
+ // EC objects.
+ if (m_pg.get_is_hinfo_required() && !m_pg.get_is_nonprimary_shard(shard)) {
if (!shard_result.has_hinfo_missing() &&
!shard_result.has_hinfo_corrupted()) {
// ------------------------------------------------------------------------
// sizes:
- // NOTE: This will be fixed as a later PR as part of the optimized EC work.
- uint64_t oi_size = logical_to_ondisk_size(auth_oi.size, 0);
+ uint64_t oi_size = logical_to_ondisk_size(auth_oi.size, shard.shard);
if (oi_size != candidate.size) {
fmt::format_to(std::back_inserter(out),
"{}size {} != size {} from auth oi {}",
shard_result.set_size_mismatch_info();
}
- if (auth.size != candidate.size) {
+ // In optimized EC, shards can have different sizes, so this test does not
+ // apply; the per-shard sizes have already been checked above.
+ if (!m_pg.get_is_ec_optimized() && auth.size != candidate.size) {
fmt::format_to(std::back_inserter(out),
"{}size {} != size {} from shard {}",
sep(error),
// ------------------------------------------------------------------------
// comparing the attributes:
+ // Other than OI, only potential primaries have the attributes.
- for (const auto& [k, v] : auth.attrs) {
- if (k == OI_ATTR || k[0] != '_') {
- // We check system keys separately
- continue;
- }
+ if (!m_pg.get_is_nonprimary_shard(shard)) {
+ for (const auto& [k, v] : auth.attrs) {
+ if (k == OI_ATTR || k[0] != '_') {
+ // We check system keys separately
+ continue;
+ }
- auto cand = candidate.attrs.find(k);
- if (cand == candidate.attrs.end()) {
- fmt::format_to(std::back_inserter(out),
- "{}attr name mismatch '{}'",
- sep(error),
- k);
- obj_result.set_attr_name_mismatch();
- } else if (!cand->second.contents_equal(v)) {
- fmt::format_to(std::back_inserter(out),
- "{}attr value mismatch '{}'",
- sep(error),
- k);
- obj_result.set_attr_value_mismatch();
+ auto cand = candidate.attrs.find(k);
+ if (cand == candidate.attrs.end()) {
+ fmt::format_to(std::back_inserter(out),
+ "{}attr name mismatch '{}'",
+ sep(error),
+ k);
+ obj_result.set_attr_name_mismatch();
+ } else if (!cand->second.contents_equal(v)) {
+ fmt::format_to(std::back_inserter(out),
+ "{}attr value mismatch '{}'",
+ sep(error),
+ k);
+ obj_result.set_attr_value_mismatch();
+ }
}
- }
- for (const auto& [k, v] : candidate.attrs) {
- if (k == OI_ATTR || k[0] != '_') {
- // We check system keys separately
- continue;
- }
+ for (const auto& [k, v] : candidate.attrs) {
+ if (k == OI_ATTR || k[0] != '_') {
+ // We check system keys separately
+ continue;
+ }
- auto in_auth = auth.attrs.find(k);
- if (in_auth == auth.attrs.end()) {
- fmt::format_to(std::back_inserter(out),
- "{}attr name mismatch '{}'",
- sep(error),
- k);
- obj_result.set_attr_name_mismatch();
+ auto in_auth = auth.attrs.find(k);
+ if (in_auth == auth.attrs.end()) {
+ fmt::format_to(std::back_inserter(out),
+ "{}attr name mismatch '{}'",
+ sep(error),
+ k);
+ obj_result.set_attr_name_mismatch();
+ }
}
}
* [Snapset clones 4]
* EOL obj4 snap 4, (expected)
*/
-void ScrubBackend::scrub_snapshot_metadata(ScrubMap& map)
+void ScrubBackend::scrub_snapshot_metadata(ScrubMap& map, const pg_shard_t &srd)
{
dout(10) << __func__ << " num stat obj "
<< m_pg.get_pg_info(ScrubberPasskey{}).stats.stats.sum.num_objects
}
if (oi) {
- // NOTE: Fix planned as part of the optimized EC work.
- if (logical_to_ondisk_size(oi->size, 0) != p->second.size) {
+ if (logical_to_ondisk_size(oi->size, srd.shard) != p->second.size) {
clog.error() << m_mode_desc << " " << m_pg_id << " " << soid
<< " : on disk size (" << p->second.size
<< ") does not match object info size (" << oi->size
<< ") adjusted for ondisk to ("
- << logical_to_ondisk_size(oi->size, 0) << ")";
+ << logical_to_ondisk_size(oi->size, srd.shard) << ")";
soid_error.set_size_mismatch();
this_chunk->m_error_counts.shallow_errors++;
}
struct shard_as_auth_t {
// note: 'not_found' differs from 'not_usable' in that 'not_found'
// does not carry an error message to be cluster-logged.
- enum class usable_t : uint8_t { not_usable, not_found, usable };
+ enum class usable_t : uint8_t { not_usable, not_found, usable, not_usable_no_err };
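+ // 'not_usable_no_err' marks a shard that cannot be used as auth (e.g. a
+ // non-primary EC shard) but is not treated as an error.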
// the ctor used when the shard should not be considered as auth
explicit shard_as_auth_t(std::string err_msg)
shard_as_auth_t(const object_info_t& anoi,
shard_to_scrubmap_t::iterator it,
std::string err_msg,
- std::optional<uint32_t> data_digest)
- : possible_auth{usable_t::usable}
+ std::optional<uint32_t> data_digest,
+ bool nonprimary_ec)
+ : possible_auth{nonprimary_ec ? usable_t::not_usable_no_err : usable_t::usable}
, error_text{err_msg}
, oi{anoi}
, auth_iter{it}
if (as_auth.possible_auth == shard_as_auth_t::usable_t::not_found) {
return fmt::format_to(ctx.out(), "{{shard-not-found}}");
}
+ if (as_auth.possible_auth == shard_as_auth_t::usable_t::not_usable_no_err) {
+ return fmt::format_to(ctx.out(),
+ "{{shard-not-usable-no-err:{}}}",
+ as_auth.error_text);
+ }
return fmt::format_to(ctx.out(),
"{{shard-usable: soid:{} {{txt:{}}} }}",
as_auth.oi.soid,
const spg_t m_pg_id;
std::vector<pg_shard_t> m_acting_but_me; // primary only
bool m_is_replicated{true};
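+ // Set when the pool has EC optimizations enabled.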
+ bool m_is_optimized_ec{false};
std::string_view m_mode_desc;
std::string m_formatted_id;
const PGPool& m_pool;
/**
* Validate consistency of the object info and snap sets.
*/
- void scrub_snapshot_metadata(ScrubMap& map);
+ void scrub_snapshot_metadata(ScrubMap& map, const pg_shard_t &srd);
/**
* Updates the "global" (i.e. - not 'per-chunk') databases:
// accessing the PG backend for this translation service
uint64_t logical_to_ondisk_size(uint64_t logical_size,
- int8_t shard_id) const;
+ shard_id_t shard_id) const;
};
namespace fmt {
// query the PG backend for the on-disk size of an object
virtual uint64_t logical_to_ondisk_size(uint64_t logical_size,
- int8_t shard_id) const = 0;
+ shard_id_t shard_id) const = 0;
// used to verify our "cleanliness" before scrubbing
virtual bool is_waiting_for_unreadable_object() const = 0;
+
+ // A non-primary shard is one which can never become primary. It may
+ // have an old version and cannot be considered authoritative.
+ virtual bool get_is_nonprimary_shard(const pg_shard_t &pg_shard) const = 0;
+
+ // hinfo attributes are not used for some EC configurations. Do not raise
+ // scrub errors for hinfo if it should not exist.
+ virtual bool get_is_hinfo_required() const = 0;
+
+ // If true, the EC optimisations have been enabled.
+ virtual bool get_is_ec_optimized() const = 0;
};
// defining a specific subset of performance counters. Each of the members
const pg_info_t& get_pg_info(ScrubberPasskey) const final { return m_info; }
uint64_t logical_to_ondisk_size(uint64_t logical_size,
- int8_t shard_id) const final
+ shard_id_t shard_id) const final
{
return logical_size;
}
std::shared_ptr<PGPool> m_pool;
pg_info_t& m_info;
pg_shard_t m_pshard;
+
+ bool get_is_nonprimary_shard(const pg_shard_t &pg_shard) const final
+ {
+ return get_is_ec_optimized() &&
+ m_pool->info.is_nonprimary_shard(pg_shard.shard);
+ }
+ bool get_is_hinfo_required() const final
+ {
+ return get_is_ec_optimized() &&
+ !m_pool->info.has_flag(m_pool->info.FLAG_EC_OVERWRITES);
+ }
+ bool get_is_ec_optimized() const final {
+ return m_pool->info.has_flag(m_pool->info.FLAG_EC_OPTIMIZATIONS);
+ }
};