From: Josh Durgin
Date: Fri, 20 Apr 2018 22:39:40 +0000 (-0400)
Subject: osd/ECBackend: recover from EIO based on the minimum data necessary
X-Git-Tag: v13.1.0~121^2~2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=b162a5478d6a907cc0a9ddd5ae8442e81f8d8fb3;p=ceph.git

osd/ECBackend: recover from EIO based on the minimum data necessary

Discount shards that already returned EIO, and use minimum_to_decode()
to request just what is necessary to recover or read the originally
requested extents of the object.

Signed-off-by: Josh Durgin
---

diff --git a/qa/standalone/erasure-code/test-erasure-eio.sh b/qa/standalone/erasure-code/test-erasure-eio.sh
index f4c3a7318f20..0b7a5b3a7526 100755
--- a/qa/standalone/erasure-code/test-erasure-eio.sh
+++ b/qa/standalone/erasure-code/test-erasure-eio.sh
@@ -354,7 +354,7 @@ function TEST_rados_get_with_subreadall_eio_shard_1() {
 }
 
 # Test recovery when the first k copies aren't all available
-function TEST_ec_recovery_errors() {
+function TEST_ec_single_recovery_error() {
     local dir=$1
     local objname=myobject
 
@@ -376,6 +376,40 @@ function TEST_ec_recovery_errors() {
     # Cluster should recover this object
     wait_for_clean || return 1
 
+    rados_get $dir $poolname myobject || return 1
+
+    delete_erasure_coded_pool $poolname
+}
+
+# Test recovery when repeated reads are needed due to EIO
+function TEST_ec_recovery_multiple_errors() {
+    local dir=$1
+    local objname=myobject
+
+    setup_osds 9 || return 1
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname 4 4 || return 1
+
+    rados_put $dir $poolname $objname || return 1
+    inject_eio ec data $poolname $objname $dir 0 || return 1
+    # The first read will try shards 0,1,2; when 0 gets EIO, shard 3 gets
+    # tried as well. Make that fail too, to test multiple-EIO handling.
+    inject_eio ec data $poolname $objname $dir 3 || return 1
+    inject_eio ec data $poolname $objname $dir 4 || return 1
+
+    local -a initial_osds=($(get_osds $poolname $objname))
+    local last_osd=${initial_osds[-1]}
+    # Kill OSD
+    kill_daemons $dir TERM osd.${last_osd} >&2 < /dev/null || return 1
+    ceph osd down ${last_osd} || return 1
+    ceph osd out ${last_osd} || return 1
+
+    # Cluster should recover this object
+    wait_for_clean || return 1
+
+    rados_get $dir $poolname myobject || return 1
+
     delete_erasure_coded_pool $poolname
 }
 
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index ec503e4e0018..8efdb4f39a4d 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -1462,6 +1462,7 @@ void ECBackend::call_write_ordered(std::function<void(void)> &&cb) {
 
 void ECBackend::get_all_avail_shards(
   const hobject_t &hoid,
+  const set<pg_shard_t> &error_shards,
   set<int> &have,
   map<shard_id_t, pg_shard_t> &shards,
   bool for_recovery)
@@ -1472,6 +1473,8 @@
        ++i) {
     dout(10) << __func__ << ": checking acting " << *i << dendl;
     const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
+    if (error_shards.find(*i) != error_shards.end())
+      continue;
     if (!missing.is_missing(hoid)) {
       assert(!have.count(i->shard));
       have.insert(i->shard);
@@ -1485,6 +1488,8 @@
 	 get_parent()->get_backfill_shards().begin();
        i != get_parent()->get_backfill_shards().end();
        ++i) {
+    if (error_shards.find(*i) != error_shards.end())
+      continue;
     if (have.count(i->shard)) {
       assert(shards.count(i->shard));
       continue;
@@ -1511,6 +1516,8 @@
       if (m) {
 	assert(!(*m).is_missing(hoid));
       }
+      if (error_shards.find(*i) != error_shards.end())
+	continue;
       have.insert(i->shard);
       shards.insert(make_pair(i->shard, *i));
     }
@@ -1530,8 +1537,9 @@
 
   set<int> have;
   map<shard_id_t, pg_shard_t> shards;
+  set<pg_shard_t> error_shards;
 
-  get_all_avail_shards(hoid, have, shards, for_recovery);
+  get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
 
   map<int, vector<pair<int, int>>> need;
   int r = ec_impl->minimum_to_decode(want, have, &need);
@@ -1559,6 +1567,8 @@
 int ECBackend::get_remaining_shards(
   const hobject_t &hoid,
   const set<int> &avail,
+  const set<int> &want,
+  const read_result_t &result,
   map<pg_shard_t, vector<pair<int, int>>> *to_read,
   bool for_recovery)
 {
@@ -1566,17 +1576,36 @@
 
   set<int> have;
   map<shard_id_t, pg_shard_t> shards;
+  set<pg_shard_t> error_shards;
+  for (auto &p : result.errors) {
+    error_shards.insert(p.first);
+  }
+
+  get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+
+  map<int, vector<pair<int, int>>> need;
+  int r = ec_impl->minimum_to_decode(want, have, &need);
+  if (r < 0) {
+    dout(0) << __func__ << " not enough shards left to try for " << hoid
+	    << " read result was " << result << dendl;
+    return -EIO;
+  }
 
-  get_all_avail_shards(hoid, have, shards, for_recovery);
+  set<int> shards_left;
+  for (auto p : need) {
+    if (avail.find(p.first) == avail.end()) {
+      shards_left.insert(p.first);
+    }
+  }
 
   vector<pair<int, int>> subchunks;
   subchunks.push_back(make_pair(0, ec_impl->get_sub_chunk_count()));
-  for (set<int>::iterator i = have.begin();
-       i != have.end();
+  for (set<int>::iterator i = shards_left.begin();
+       i != shards_left.end();
        ++i) {
     assert(shards.count(shard_id_t(*i)));
-    if (avail.find(*i) == avail.end())
-      to_read->insert(make_pair(shards[shard_id_t(*i)], subchunks));
+    assert(avail.find(*i) == avail.end());
+    to_read->insert(make_pair(shards[shard_id_t(*i)], subchunks));
   }
   return 0;
 }
@@ -2298,15 +2327,11 @@ int ECBackend::send_all_remaining_reads(
     already_read.insert(i->shard);
   dout(10) << __func__ << " have/error shards=" << already_read << dendl;
   map<pg_shard_t, vector<pair<int, int>>> shards;
-  int r = get_remaining_shards(hoid, already_read, &shards, rop.for_recovery);
+  int r = get_remaining_shards(hoid, already_read, rop.want_to_read[hoid],
+			       rop.complete[hoid], &shards, rop.for_recovery);
   if (r)
     return r;
-  if (shards.empty())
-    return -EIO;
-
-  dout(10) << __func__ << " Read remaining shards " << shards << dendl;
 
-  // TODOSAM: this doesn't seem right
   list<boost::tuple<uint64_t, uint64_t, uint32_t> > offsets =
     rop.to_read.find(hoid)->second.to_read;
   GenContext<pair<RecoveryMessages *, read_result_t& > &> *c =
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
index 7b02d1a2055b..b647e6c597fa 100644
--- a/src/osd/ECBackend.h
+++ b/src/osd/ECBackend.h
@@ -313,6 +313,7 @@ private:
     RecoveryMessages *m);
   void get_all_avail_shards(
     const hobject_t &hoid,
+    const set<pg_shard_t> &error_shards,
     set<int> &have,
     map<shard_id_t, pg_shard_t> &shards,
     bool for_recovery);
@@ -649,6 +650,8 @@ public:
   int get_remaining_shards(
     const hobject_t &hoid,
     const set<int> &avail,
+    const set<int> &want,
+    const read_result_t &result,
     map<pg_shard_t, vector<pair<int, int>>> *to_read,
     bool for_recovery);
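
Reviewer sketch (not part of the patch): the retry flow can be hard to see
through the diff, so here is a minimal, self-contained C++ model of it.
This is not Ceph code: minimum_to_decode() below is a toy "any K shards
decode" stand-in for ErasureCodeInterface::minimum_to_decode(), and
pick_retry_shards(), K, and the plain integer shard ids are illustrative
assumptions. Only the control flow matches the patch: discount shards that
already returned EIO, compute a minimal decodable set from what is left,
schedule reads only for shards not yet read, and fail with -EIO when the
remainder cannot decode.

#include <cstddef>
#include <iostream>
#include <set>

static const std::size_t K = 4;  // data chunks; the new test uses k=4, m=4

// Toy stand-in: any K distinct shards suffice to decode. Fills *need
// with some K available shards, or fails if fewer than K remain.
int minimum_to_decode(const std::set<int> &have, std::set<int> *need)
{
  if (have.size() < K)
    return -1;
  need->clear();
  for (int s : have) {
    need->insert(s);
    if (need->size() == K)
      break;
  }
  return 0;
}

// Mirrors the shape of the patched get_remaining_shards().
int pick_retry_shards(const std::set<int> &all_shards,
                      const std::set<int> &already_read,  // good and error reads
                      const std::set<int> &error_shards,
                      std::set<int> *to_read)
{
  // Discount shards that already returned EIO.
  std::set<int> have;
  for (int s : all_shards)
    if (!error_shards.count(s))
      have.insert(s);

  // Ask for the minimum set that can still decode.
  std::set<int> need;
  if (minimum_to_decode(have, &need) < 0)
    return -5;  // -EIO: not enough shards left to try

  // Only request what has not been read yet.
  to_read->clear();
  for (int s : need)
    if (!already_read.count(s))
      to_read->insert(s);
  return 0;
}

int main()
{
  std::set<int> all = {0, 1, 2, 3, 4, 5, 6, 7};  // k=4, m=4: 8 shards
  std::set<int> already_read = {0, 1, 2};        // first read attempt
  std::set<int> error_shards = {0};              // shard 0 returned EIO

  std::set<int> retry;
  if (pick_retry_shards(all, already_read, error_shards, &retry) == 0) {
    std::cout << "retry shards:";
    for (int s : retry)
      std::cout << " " << s;                     // prints: retry shards: 3 4
    std::cout << "\n";
  }
  return 0;
}

With shard 0 errored and shards 1,2 already read, only shards 3 and 4 are
requested instead of every remaining shard; the real backend makes the same
decision per requested extent via minimum_to_decode() on the surviving
shards, which is the point of the patch.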