osd/ECBackend: recover from EIO based on the minimum data necessary
author    Josh Durgin <jdurgin@redhat.com>
          Fri, 20 Apr 2018 22:39:40 +0000 (18:39 -0400)
committer Josh Durgin <jdurgin@redhat.com>
          Fri, 20 Apr 2018 23:42:14 +0000 (19:42 -0400)
Discount shards that already returned EIO, and use minimum_to_decode()
to request just what is necessary to recover or read the originally
requested extents of the object.

Signed-off-by: Josh Durgin <jdurgin@redhat.com>
qa/standalone/erasure-code/test-erasure-eio.sh
src/osd/ECBackend.cc
src/osd/ECBackend.h
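
To illustrate the idea before the diff: the sketch below is a minimal, self-contained C++ toy, not the ErasureCodeInterface API. Its minimum_to_decode() stand-in assumes a systematic MDS code where any k shards suffice and the wanted shards are preferred when still readable; the shard numbers mirror the new k=4, m=4 test with EIO injected on shards 0, 3 and 4. The point is the ordering: shards that already returned EIO are discounted from the available set first, and only then is the minimum read set computed.

// Toy sketch of the selection idea in this commit.  NOT the
// ErasureCodeInterface implementation: minimum_to_decode() here is a
// simplified stand-in that assumes any k distinct shards can decode and
// prefers the wanted shards when they are still readable.
#include <iostream>
#include <optional>
#include <set>

using shard_set = std::set<int>;

// Simplified stand-in for ec_impl->minimum_to_decode(want, have, &need).
std::optional<shard_set> minimum_to_decode(const shard_set& want,
                                           const shard_set& have,
                                           unsigned k) {
  shard_set need;
  // Prefer reading the wanted shards directly if they are still available.
  for (int s : want)
    if (have.count(s))
      need.insert(s);
  // Top up with any other available shards until k are selected.
  for (int s : have) {
    if (need.size() >= k)
      break;
    need.insert(s);
  }
  if (need.size() < k)
    return std::nullopt;  // not enough shards left: the read fails with EIO
  return need;
}

int main() {
  const unsigned k = 4;                       // k=4, m=4 as in the new test
  shard_set want = {0, 1, 2, 3};              // originally requested shards
  shard_set all  = {0, 1, 2, 3, 4, 5, 6, 7};  // acting set
  shard_set error_shards = {0, 3, 4};         // shards that returned EIO

  // Discount error shards before computing what is needed.
  shard_set have;
  for (int s : all)
    if (!error_shards.count(s))
      have.insert(s);

  if (auto need = minimum_to_decode(want, have, k)) {
    std::cout << "read shards:";
    for (int s : *need)
      std::cout << ' ' << s;
    std::cout << '\n';                        // prints: read shards: 1 2 5 6
  } else {
    std::cout << "-EIO\n";
  }
}

With that filtering in place, running out of decodable shards surfaces naturally as -EIO from the selection step, rather than from an empty retry set as in the code being removed below.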

diff --git a/qa/standalone/erasure-code/test-erasure-eio.sh b/qa/standalone/erasure-code/test-erasure-eio.sh
index f4c3a7318f20226cd2fdba2bb510d27d457d3807..0b7a5b3a7526cf24da08d34a4ad0db6968400ad5 100755 (executable)
@@ -354,7 +354,7 @@ function TEST_rados_get_with_subreadall_eio_shard_1() {
 }
 
 # Test recovery when the first k copies aren't all available
-function TEST_ec_recovery_errors() {
+function TEST_ec_single_recovery_error() {
     local dir=$1
     local objname=myobject
 
@@ -376,6 +376,40 @@ function TEST_ec_recovery_errors() {
     # Cluster should recover this object
     wait_for_clean || return 1
 
+    rados_get $dir $poolname myobject || return 1
+
+    delete_erasure_coded_pool $poolname
+}
+
+# Test recovery when repeated reads are needed due to EIO
+function TEST_ec_recovery_multiple_errors() {
+    local dir=$1
+    local objname=myobject
+
+    setup_osds 9 || return 1
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname 4 4 || return 1
+
+    rados_put $dir $poolname $objname || return 1
+    inject_eio ec data $poolname $objname $dir 0 || return 1
+    # The first read tries shards 0,1,2; when shard 0 returns EIO, shard 3
+    # gets tried as well. Make that fail too, to test multiple-EIO handling.
+    inject_eio ec data $poolname $objname $dir 3 || return 1
+    inject_eio ec data $poolname $objname $dir 4 || return 1
+
+    local -a initial_osds=($(get_osds $poolname $objname))
+    local last_osd=${initial_osds[-1]}
+    # Kill OSD
+    kill_daemons $dir TERM osd.${last_osd} >&2 < /dev/null || return 1
+    ceph osd down ${last_osd} || return 1
+    ceph osd out ${last_osd} || return 1
+
+    # Cluster should recover this object
+    wait_for_clean || return 1
+
+    rados_get $dir $poolname myobject || return 1
+
     delete_erasure_coded_pool $poolname
 }
 
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index ec503e4e0018f4fad13309c1a58c9809a088035a..8efdb4f39a4d8f7263109f0fe0955f43140752b4 100644 (file)
@@ -1462,6 +1462,7 @@ void ECBackend::call_write_ordered(std::function<void(void)> &&cb) {
 
 void ECBackend::get_all_avail_shards(
   const hobject_t &hoid,
+  const set<pg_shard_t> &error_shards,
   set<int> &have,
   map<shard_id_t, pg_shard_t> &shards,
   bool for_recovery)
@@ -1472,6 +1473,8 @@ void ECBackend::get_all_avail_shards(
        ++i) {
     dout(10) << __func__ << ": checking acting " << *i << dendl;
     const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
+    if (error_shards.find(*i) != error_shards.end())
+      continue;
     if (!missing.is_missing(hoid)) {
       assert(!have.count(i->shard));
       have.insert(i->shard);
@@ -1485,6 +1488,8 @@ void ECBackend::get_all_avail_shards(
           get_parent()->get_backfill_shards().begin();
         i != get_parent()->get_backfill_shards().end();
         ++i) {
+      if (error_shards.find(*i) != error_shards.end())
+       continue;
       if (have.count(i->shard)) {
        assert(shards.count(i->shard));
        continue;
@@ -1511,6 +1516,8 @@ void ECBackend::get_all_avail_shards(
        if (m) {
          assert(!(*m).is_missing(hoid));
        }
+       if (error_shards.find(*i) != error_shards.end())
+         continue;
        have.insert(i->shard);
        shards.insert(make_pair(i->shard, *i));
       }
@@ -1530,8 +1537,9 @@ int ECBackend::get_min_avail_to_read_shards(
 
   set<int> have;
   map<shard_id_t, pg_shard_t> shards;
+  set<pg_shard_t> error_shards;
 
-  get_all_avail_shards(hoid, have, shards, for_recovery);
+  get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
 
   map<int, vector<pair<int, int>>> need;
   int r = ec_impl->minimum_to_decode(want, have, &need);
@@ -1559,6 +1567,8 @@ int ECBackend::get_min_avail_to_read_shards(
 int ECBackend::get_remaining_shards(
   const hobject_t &hoid,
   const set<int> &avail,
+  const set<int> &want,
+  const read_result_t &result,
   map<pg_shard_t, vector<pair<int, int>>> *to_read,
   bool for_recovery)
 {
@@ -1566,17 +1576,36 @@ int ECBackend::get_remaining_shards(
 
   set<int> have;
   map<shard_id_t, pg_shard_t> shards;
+  set<pg_shard_t> error_shards;
+  for (auto &p : result.errors) {
+    error_shards.insert(p.first);
+  }
+
+  get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+
+  map<int, vector<pair<int, int>>> need;
+  int r = ec_impl->minimum_to_decode(want, have, &need);
+  if (r < 0) {
+    dout(0) << __func__ << " not enough shards left to try for " << hoid
+           << " read result was " << result << dendl;
+    return -EIO;
+  }
 
-  get_all_avail_shards(hoid, have, shards, for_recovery);
+  set<int> shards_left;
+  for (auto p : need) {
+    if (avail.find(p.first) == avail.end()) {
+      shards_left.insert(p.first);
+    }
+  }
 
   vector<pair<int, int>> subchunks;
   subchunks.push_back(make_pair(0, ec_impl->get_sub_chunk_count()));
-  for (set<int>::iterator i = have.begin();
-       i != have.end();
+  for (set<int>::iterator i = shards_left.begin();
+       i != shards_left.end();
        ++i) {
     assert(shards.count(shard_id_t(*i)));
-    if (avail.find(*i) == avail.end())
-      to_read->insert(make_pair(shards[shard_id_t(*i)], subchunks));
+    assert(avail.find(*i) == avail.end());
+    to_read->insert(make_pair(shards[shard_id_t(*i)], subchunks));
   }
   return 0;
 }
@@ -2298,15 +2327,11 @@ int ECBackend::send_all_remaining_reads(
     already_read.insert(i->shard);
   dout(10) << __func__ << " have/error shards=" << already_read << dendl;
   map<pg_shard_t, vector<pair<int, int>>> shards;
-  int r = get_remaining_shards(hoid, already_read, &shards, rop.for_recovery);
+  int r = get_remaining_shards(hoid, already_read, rop.want_to_read[hoid],
+                              rop.complete[hoid], &shards, rop.for_recovery);
   if (r)
     return r;
-  if (shards.empty())
-    return -EIO;
-
-  dout(10) << __func__ << " Read remaining shards " << shards << dendl;
 
-  // TODOSAM: this doesn't seem right
   list<boost::tuple<uint64_t, uint64_t, uint32_t> > offsets =
     rop.to_read.find(hoid)->second.to_read;
   GenContext<pair<RecoveryMessages *, read_result_t& > &> *c =
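
A second sketch of how send_all_remaining_reads() now composes with the reworked get_remaining_shards(): the `need` set below is a hypothetical stand-in for what minimum_to_decode() might answer over the non-errored shards (illustrative values only, k=4/m=4 as in the test). The retry requests only need minus the shards already read, instead of every remaining shard as before.

// Minimal sketch (illustrative values, not ECBackend's real types) of the
// reworked retry path: after some shards return EIO, the follow-up read
// requests only the shards that are needed for decoding AND were not part
// of the first round.
#include <iostream>
#include <set>

int main() {
  // First round covered shards 0-3; shard 0 returned EIO.  As in
  // send_all_remaining_reads(), "already read" includes both the shards
  // that returned data and the shards that returned errors.
  std::set<int> already_read = {0, 1, 2, 3};
  // Hypothetical answer from minimum_to_decode() over the non-errored shards.
  std::set<int> need = {1, 2, 3, 5};

  // shards_left = need \ already_read: only these get a new sub-read.
  std::set<int> shards_left;
  for (int s : need)
    if (!already_read.count(s))
      shards_left.insert(s);

  for (int s : shards_left)
    std::cout << "re-read shard " << s << '\n';  // prints: re-read shard 5
}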
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
index 7b02d1a2055be70e2a83a9e6a4d6b49ab4aa501d..b647e6c597fa5c530ad69f602944f509564b9b21 100644 (file)
@@ -313,6 +313,7 @@ private:
     RecoveryMessages *m);
   void get_all_avail_shards(
     const hobject_t &hoid,
+    const set<pg_shard_t> &error_shards,
     set<int> &have,
     map<shard_id_t, pg_shard_t> &shards,
     bool for_recovery);
@@ -649,6 +650,8 @@ public:
   int get_remaining_shards(
     const hobject_t &hoid,
     const set<int> &avail,
+    const set<int> &want,
+    const read_result_t &result,
     map<pg_shard_t, vector<pair<int, int>>> *to_read,
     bool for_recovery);