git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: retry to read object attrs at EC recovery 22394/head
author cuixf <cuixiaofei@sangfor.com.cn>
Thu, 24 May 2018 00:01:25 +0000 (20:01 -0400)
committer Kefu Chai <kchai@redhat.com>
Mon, 4 Jun 2018 14:54:59 +0000 (22:54 +0800)
During an EC recovery read, if reading the object's attrs fails or returns errors, we erase the attrs we
have already read and try to read them again from the remaining shards. This lets the primary OSD get the
object's correct attrs and avoids an assert.

Fixes: http://tracker.ceph.com/issues/24406
Signed-off-by: xiaofei cui <cuixiaofei@sangfor.com>
(cherry picked from commit 3eb1679b1fadb6adb6ff4ed3f1f6069a85f4bbcd)

qa/standalone/erasure-code/test-erasure-eio.sh
src/osd/ECBackend.cc

index 309df13340cf74ac4ef18dba3098f054fb01dee8..a37258b2fd86327fa4049efb398ebc26284f6d22 100755 (executable)
@@ -353,6 +353,37 @@ function TEST_rados_get_with_subreadall_eio_shard_1() {
     delete_erasure_coded_pool $poolname
 }
 
+# Test recovery the object attr read error
+function TEST_ec_object_attr_read_error() {
+    local dir=$1
+    local objname=myobject
+
+    setup_osds 7 || return 1
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname 3 2 || return 1
+
+    local primary_osd=$(get_primary $poolname $objname)
+    # Kill primary OSD
+    kill_daemons $dir TERM osd.${primary_osd} >&2 < /dev/null || return 1
+
+    # Write data
+    rados_put $dir $poolname $objname || return 1
+
+    # Inject eio, shard 1 is the one read attr
+    inject_eio ec mdata $poolname $objname $dir 1 || return 1
+
+    # Restart OSD
+    run_osd $dir ${primary_osd} || return 1
+
+    # Cluster should recover this object
+    wait_for_clean || return 1
+
+    rados_get $dir $poolname myobject || return 1
+
+    delete_erasure_coded_pool $poolname
+}
+
 # Test recovery the first k copies aren't all available
 function TEST_ec_single_recovery_error() {
     local dir=$1
index 2fde70fdbe16c9f20a7d64ed0c1d2c9a38b732be..6c6e955e72d57dc30579b57801e24174c6a86d53 100644 (file)
@@ -1059,6 +1059,8 @@ error:
        *i, ghobject_t::NO_GEN, shard),
       reply->attrs_read[*i]);
     if (r < 0) {
+      // If we read error, we should not return the attrs too.
+      reply->attrs_read.erase(*i);
       reply->buffers_read.erase(*i);
       reply->errors[*i] = r;
     }
@@ -2342,13 +2344,21 @@ int ECBackend::send_all_remaining_reads(
   GenContext<pair<RecoveryMessages *, read_result_t& > &> *c =
     rop.to_read.find(hoid)->second.cb;
 
+  // (Note cuixf) If we need to read attrs and we read failed, try to read again.
+  bool want_attrs =
+    rop.to_read.find(hoid)->second.want_attrs &&
+    (!rop.complete[hoid].attrs || rop.complete[hoid].attrs->empty());
+  if (want_attrs) {
+    dout(10) << __func__ << " want attrs again" << dendl;
+  }
+
   rop.to_read.erase(hoid);
   rop.to_read.insert(make_pair(
       hoid,
       read_request_t(
        offsets,
        shards,
-       false,
+       want_attrs,
        c)));
   do_read_op(rop);
   return 0;