From 35ddfdeac2e0394a8300548c262fe433e8951bd6 Mon Sep 17 00:00:00 2001
From: Josh Durgin <jdurgin@redhat.com>
Date: Fri, 20 Apr 2018 18:39:40 -0400
Subject: [PATCH] osd/ECBackend: recover from EIO based on the minimum data
 necessary

Discount shards that already returned EIO, and use minimum_to_decode()
to request just what is necessary to recover or read the originally
requested extents of the object.

Signed-off-by: Josh Durgin <jdurgin@redhat.com>
(cherry picked from commit b162a5478d6a907cc0a9ddd5ae8442e81f8d8fb3)

Conflicts:
	src/osd/ECBackend.cc (Adjust for Luminous not having subchunks)
	src/osd/ECBackend.h (trivial)
---
 .../erasure-code/test-erasure-eio.sh          | 36 +++++++++++++-
 src/osd/ECBackend.cc                          | 49 ++++++++++++++-----
 src/osd/ECBackend.h                           |  3 ++
 3 files changed, 75 insertions(+), 13 deletions(-)

diff --git a/qa/standalone/erasure-code/test-erasure-eio.sh b/qa/standalone/erasure-code/test-erasure-eio.sh
index 62ac49dbba46b..a4abad1c9e9ad 100755
--- a/qa/standalone/erasure-code/test-erasure-eio.sh
+++ b/qa/standalone/erasure-code/test-erasure-eio.sh
@@ -354,7 +354,7 @@ function TEST_rados_get_with_subreadall_eio_shard_1() {
 }
 
 # Test recovery the first k copies aren't all available
-function TEST_ec_recovery_errors() {
+function TEST_ec_single_recovery_error() {
     local dir=$1
     local objname=myobject
 
@@ -376,6 +376,40 @@ function TEST_ec_recovery_errors() {
     # Cluster should recover this object
     wait_for_clean || return 1
 
+    rados_get $dir $poolname myobject || return 1
+
+    delete_erasure_coded_pool $poolname
+}
+
+# Test recovery when repeated reads are needed due to EIO
+function TEST_ec_recovery_multiple_errors() {
+    local dir=$1
+    local objname=myobject
+
+    setup_osds 9 || return 1
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname 4 4 || return 1
+
+    rados_put $dir $poolname $objname || return 1
+    inject_eio ec data $poolname $objname $dir 0 || return 1
+    # first read will try shards 0,1,2 when 0 gets EIO, shard 3 gets
+    # tried as well. Make that fail to test multiple-EIO handling.
+    inject_eio ec data $poolname $objname $dir 3 || return 1
+    inject_eio ec data $poolname $objname $dir 4 || return 1
+
+    local -a initial_osds=($(get_osds $poolname $objname))
+    local last_osd=${initial_osds[-1]}
+    # Kill OSD
+    kill_daemons $dir TERM osd.${last_osd} >&2 < /dev/null || return 1
+    ceph osd down ${last_osd} || return 1
+    ceph osd out ${last_osd} || return 1
+
+    # Cluster should recover this object
+    wait_for_clean || return 1
+
+    rados_get $dir $poolname myobject || return 1
+
     delete_erasure_coded_pool $poolname
 }
 
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index 6bdc4e351e4a5..9e29ecc028345 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -1493,6 +1493,7 @@ void ECBackend::call_write_ordered(std::function<void(void)> &&cb) {
 
 void ECBackend::get_all_avail_shards(
   const hobject_t &hoid,
+  const set<pg_shard_t> &error_shards,
   set<int> &have,
   map<shard_id_t, pg_shard_t> &shards,
   bool for_recovery)
@@ -1503,6 +1504,8 @@ void ECBackend::get_all_avail_shards(
        ++i) {
     dout(10) << __func__ << ": checking acting " << *i << dendl;
     const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
+    if (error_shards.find(*i) != error_shards.end())
+      continue;
     if (!missing.is_missing(hoid)) {
       assert(!have.count(i->shard));
       have.insert(i->shard);
@@ -1516,6 +1519,8 @@ void ECBackend::get_all_avail_shards(
          get_parent()->get_backfill_shards().begin();
        i != get_parent()->get_backfill_shards().end();
        ++i) {
+    if (error_shards.find(*i) != error_shards.end())
+      continue;
     if (have.count(i->shard)) {
       assert(shards.count(i->shard));
       continue;
@@ -1542,6 +1547,8 @@ void ECBackend::get_all_avail_shards(
       if (m) {
         assert(!(*m).is_missing(hoid));
       }
+      if (error_shards.find(*i) != error_shards.end())
+        continue;
       have.insert(i->shard);
       shards.insert(make_pair(i->shard, *i));
     }
@@ -1561,8 +1568,9 @@ int ECBackend::get_min_avail_to_read_shards(
 
   set<int> have;
   map<shard_id_t, pg_shard_t> shards;
+  set<pg_shard_t> error_shards;
 
-  get_all_avail_shards(hoid, have, shards, for_recovery);
+  get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
 
   set<int> need;
   int r = ec_impl->minimum_to_decode(want, have, &need);
@@ -1588,6 +1596,8 @@ int ECBackend::get_min_avail_to_read_shards(
 int ECBackend::get_remaining_shards(
   const hobject_t &hoid,
   const set<int> &avail,
+  const set<int> &want,
+  const read_result_t &result,
   set<pg_shard_t> *to_read,
   bool for_recovery)
 {
@@ -1595,15 +1605,34 @@ int ECBackend::get_remaining_shards(
 
   set<int> have;
   map<shard_id_t, pg_shard_t> shards;
+  set<pg_shard_t> error_shards;
+  for (auto &p : result.errors) {
+    error_shards.insert(p.first);
+  }
+
+  get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+
+  set<int> need;
+  int r = ec_impl->minimum_to_decode(want, have, &need);
+  if (r < 0) {
+    dout(0) << __func__ << " not enough shards left to try for " << hoid
+            << " read result was " << result << dendl;
+    return -EIO;
+  }
 
-  get_all_avail_shards(hoid, have, shards, for_recovery);
+  set<int> shards_left;
+  for (auto p : need) {
+    if (avail.find(p) == avail.end()) {
+      shards_left.insert(p);
+    }
+  }
 
-  for (set<int>::iterator i = have.begin();
-       i != have.end();
+  for (set<int>::iterator i = shards_left.begin();
+       i != shards_left.end();
        ++i) {
     assert(shards.count(shard_id_t(*i)));
-    if (avail.find(*i) == avail.end())
-      to_read->insert(shards[shard_id_t(*i)]);
+    assert(avail.find(*i) == avail.end());
+    to_read->insert(shards[shard_id_t(*i)]);
   }
   return 0;
 }
@@ -2331,15 +2360,11 @@ int ECBackend::send_all_remaining_reads(
     already_read.insert(i->shard);
   dout(10) << __func__ << " have/error shards=" << already_read << dendl;
   set<pg_shard_t> shards;
-  int r = get_remaining_shards(hoid, already_read, &shards, rop.for_recovery);
+  int r = get_remaining_shards(hoid, already_read, rop.want_to_read[hoid],
+                               rop.complete[hoid], &shards, rop.for_recovery);
   if (r)
     return r;
-  if (shards.empty())
-    return -EIO;
-
-  dout(10) << __func__ << " Read remaining shards " << shards << dendl;
 
-  // TODOSAM: this doesn't seem right
   list<boost::tuple<uint64_t, uint64_t, uint32_t> > offsets =
     rop.to_read.find(hoid)->second.to_read;
   GenContext<pair<RecoveryMessages *, read_result_t& > &> *c =
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
index 7e0c2d08d47cb..98bf89d21f900 100644
--- a/src/osd/ECBackend.h
+++ b/src/osd/ECBackend.h
@@ -322,6 +322,7 @@ private:
     RecoveryMessages *m);
   void get_all_avail_shards(
     const hobject_t &hoid,
+    const set<pg_shard_t> &error_shards,
     set<int> &have,
     map<shard_id_t, pg_shard_t> &shards,
     bool for_recovery);
@@ -659,6 +660,8 @@ public:
   int get_remaining_shards(
     const hobject_t &hoid,
     const set<int> &avail,
+    const set<int> &want,
+    const read_result_t &result,
     set<pg_shard_t> *to_read,
     bool for_recovery);
 
-- 
2.39.5
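
As an aside for reviewers, the core of the change is easier to see outside
the OSD plumbing. The sketch below is an illustration only, not Ceph code:
decode_minimum() is a hypothetical stand-in for
ErasureCodeInterface::minimum_to_decode() (modeled as "any k available
shards suffice", as for Reed-Solomon), plain ints stand in for
shard_id_t/pg_shard_t, and remaining_shards() mirrors what the patched
get_remaining_shards() does: discount the shards that already returned
EIO, recompute a minimum decodable set, and schedule reads only for
members of that set that were not already read.

// Sketch of the patch's shard-selection logic, under the assumption that
// any K of the K+M shards suffice to decode (Reed-Solomon-like).
// decode_minimum() is a stand-in for ErasureCodeInterface::minimum_to_decode().
#include <cerrno>
#include <iostream>
#include <set>

static const int K = 4;  // data shards; the test pool above uses k=4, m=4

// Pick a decodable subset of `have`, preferring the shards we actually want.
// Returns false if fewer than K shards are available at all.
bool decode_minimum(const std::set<int> &want,
                    const std::set<int> &have,
                    std::set<int> *need)
{
  for (int s : want)
    if (have.count(s) && (int)need->size() < K)
      need->insert(s);
  for (int s : have)
    if ((int)need->size() < K)
      need->insert(s);
  return (int)need->size() >= K;
}

// Mirrors get_remaining_shards() after this patch: drop shards that already
// returned EIO, recompute the minimum decodable set, and read only the
// members of that set that were not already read successfully (`avail`).
int remaining_shards(const std::set<int> &want,
                     std::set<int> have,              // all shards in the PG
                     const std::set<int> &error_shards,
                     const std::set<int> &avail,      // already read OK
                     std::set<int> *to_read)
{
  for (int s : error_shards)
    have.erase(s);                 // discount shards that already failed
  std::set<int> need;
  if (!decode_minimum(want, have, &need))
    return -EIO;                   // not enough shards left to try
  for (int s : need)
    if (!avail.count(s))
      to_read->insert(s);          // request only what is still missing
  return 0;
}

int main()
{
  // k=4, m=4: shards 0-7. Shards 0 and 3 returned EIO; 1 and 2 read fine.
  std::set<int> have{0, 1, 2, 3, 4, 5, 6, 7};
  std::set<int> to_read;
  if (remaining_shards({0, 1, 2, 3}, have, {0, 3}, {1, 2}, &to_read) == 0)
    for (int s : to_read)
      std::cout << "re-read shard " << s << "\n";  // prints shards 4 and 5
}

With the pre-patch behavior, the retry round would have requested every
not-yet-read shard (4 through 7); with the minimum_to_decode()-driven
selection it requests only shards 4 and 5, and a retry that cannot reach
K usable shards fails fast with -EIO instead of issuing useless reads.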