osd: Handle recovery read errors
author    David Zafman <dzafman@redhat.com>
          Fri, 20 May 2016 22:20:18 +0000 (15:20 -0700)
committer David Zafman <dzafman@redhat.com>
          Thu, 9 Feb 2017 17:16:07 +0000 (09:16 -0800)
Fixes: http://tracker.ceph.com/issues/13937
Signed-off-by: David Zafman <dzafman@redhat.com>
(cherry picked from commit c51d70e1e837c972e42ddd5fa66f7ca4477b95cc)

Conflicts:
src/osd/ReplicatedPG.h (trivial)
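For orientation, here is a minimal, self-contained sketch (not Ceph code; Shard, Object, ReadResult and Recovery are simplified stand-ins for pg_shard_t, hobject_t, ECBackend::read_result_t and the PG/backend pair) of the flow this commit introduces: instead of asserting when a recovery read fails, the erasure-coded backend collects the shards that reported errors, cancels the recovery op for the object, and passes the whole list to failed_push, which drops each failed shard from the object's known locations.

```cpp
// Standalone illustration only; all names below are simplified stand-ins.
#include <cassert>
#include <iostream>
#include <list>
#include <map>
#include <set>
#include <string>

using Shard  = int;          // stand-in for pg_shard_t
using Object = std::string;  // stand-in for hobject_t

// Stand-in for ECBackend::read_result_t: overall return code plus
// per-shard error codes.
struct ReadResult {
  int r = 0;
  std::map<Shard, int> errors;
};

struct Recovery {
  std::set<Object> recovering;                  // objects being recovered
  std::map<Object, std::set<Shard>> locations;  // known copies per object

  // Mirrors the new failed_push(list<pg_shard_t>, hobject_t): every shard
  // that returned an error stops being considered a source for the object.
  void failed_push(const std::list<Shard>& from, const Object& obj) {
    assert(recovering.count(obj));
    recovering.erase(obj);
    for (Shard s : from)
      locations[obj].erase(s);
    std::cout << obj << " unfound? " << locations[obj].empty() << "\n";
  }

  // Mirrors ECBackend::_failed_push: collect the shards whose reads failed
  // and hand the whole list to failed_push instead of asserting.
  void on_recovery_read(const Object& obj, const ReadResult& res) {
    if (res.r == 0 && res.errors.empty())
      return;  // read succeeded; real code would continue recovery here
    std::list<Shard> failed;
    for (const auto& e : res.errors)
      failed.push_back(e.first);
    failed_push(failed, obj);
  }
};

int main() {
  Recovery rec;
  rec.recovering = {"obj1"};
  rec.locations["obj1"] = {0, 1, 2};

  ReadResult res;
  res.r = -5;              // EIO-like failure
  res.errors = {{1, -5}};  // shard 1 returned an error

  rec.on_recovery_read("obj1", res);  // prints: obj1 unfound? 0
}
```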

src/osd/ECBackend.cc
src/osd/ECBackend.h
src/osd/PGBackend.h
src/osd/ReplicatedBackend.cc
src/osd/ReplicatedPG.cc
src/osd/ReplicatedPG.h

src/osd/ECBackend.cc
index b3a47c34bbcd39c803d8cefd2edba2e404837111..0d5f9c8587f32966af44df222e3c85b8a322787d 100644 (file)
@@ -187,6 +187,24 @@ PGBackend::RecoveryHandle *ECBackend::open_recovery_op()
   return new ECRecoveryHandle;
 }
 
+void ECBackend::_failed_push(const hobject_t &hoid,
+  pair<RecoveryMessages *, ECBackend::read_result_t &> &in)
+{
+  ECBackend::read_result_t &res = in.second;
+  dout(10) << __func__ << ": Read error " << hoid << " r="
+          << res.r << " errors=" << res.errors << dendl;
+  dout(10) << __func__ << ": canceling recovery op for obj " << hoid
+          << dendl;
+  assert(recovery_ops.count(hoid));
+  recovery_ops.erase(hoid);
+
+  list<pg_shard_t> fl;
+  for (auto&& i : res.errors) {
+    fl.push_back(i.first);
+  }
+  get_parent()->failed_push(fl, hoid);
+}
+
 struct OnRecoveryReadComplete :
   public GenContext<pair<RecoveryMessages*, ECBackend::read_result_t& > &> {
   ECBackend *pg;
@@ -196,9 +214,10 @@ struct OnRecoveryReadComplete :
     : pg(pg), hoid(hoid) {}
   void finish(pair<RecoveryMessages *, ECBackend::read_result_t &> &in) {
     ECBackend::read_result_t &res = in.second;
-    // FIXME???
-    assert(res.r == 0);
-    assert(res.errors.empty());
+    if (!(res.r == 0 && res.errors.empty())) {
+        pg->_failed_push(hoid, in);
+        return;
+    }
     assert(res.returned.size() == 1);
     pg->handle_recovery_read_complete(
       hoid,
@@ -1070,6 +1089,7 @@ void ECBackend::handle_sub_read_reply(
   unsigned is_complete = 0;
   // For redundant reads check for completion as each shard comes in,
   // or in a non-recovery read check for completion once all the shards read.
+  // TODO: It would be nice if recovery could send more reads too
   if (rop.do_redundant_reads || (!rop.for_recovery && rop.in_progress.empty())) {
     for (map<hobject_t, read_result_t>::const_iterator iter =
         rop.complete.begin();
src/osd/ECBackend.h
index 6ce0f110df8d6cd5e9bcfd8a32cd83ea33f431d7..63a3092040c449b4f989c792e5476ca664aa308d 100644 (file)
@@ -504,6 +504,8 @@ public:
   uint64_t be_get_ondisk_size(uint64_t logical_size) {
     return sinfo.logical_to_next_chunk_offset(logical_size);
   }
+  void _failed_push(const hobject_t &hoid,
+    pair<RecoveryMessages *, ECBackend::read_result_t &> &in);
 };
 
 #endif
src/osd/PGBackend.h
index f88c1a08af76441a523760c933d6bfed156eeb0f..204dd9b0cd0845c5d79ddfdf1b02b6d0c339a652 100644 (file)
@@ -94,7 +94,7 @@ struct shard_info_wrapper;
        pg_shard_t peer,
        const hobject_t oid) = 0;
 
-     virtual void failed_push(pg_shard_t from, const hobject_t &soid) = 0;
+     virtual void failed_push(const list<pg_shard_t> &from, const hobject_t &soid) = 0;
      
      virtual void cancel_pull(const hobject_t &soid) = 0;
 
src/osd/ReplicatedBackend.cc
index 708ca7855984976c14fb6c40842c13cb2b063966..d44e11504cd640907ca03c12a64412e8d9290ad8 100644 (file)
@@ -2357,7 +2357,8 @@ void ReplicatedBackend::sub_op_push(OpRequestRef op)
 
 void ReplicatedBackend::_failed_push(pg_shard_t from, const hobject_t &soid)
 {
-  get_parent()->failed_push(from, soid);
+  list<pg_shard_t> fl = { from };
+  get_parent()->failed_push(fl, soid);
   pull_from_peer[from].erase(soid);
   if (pull_from_peer[from].empty())
     pull_from_peer.erase(from);
src/osd/ReplicatedPG.cc
index 4fd4776503a1ffd6c3fe1d9427bf87ae66b29cfd..ef69bf8917b2b4079c9a4049ad0ab69429fb4e9e 100644 (file)
@@ -9588,12 +9588,12 @@ void ReplicatedPG::recover_got(hobject_t oid, eversion_t v)
   }
 }
 
-
-void ReplicatedPG::failed_push(pg_shard_t from, const hobject_t &soid)
+void ReplicatedPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
 {
   assert(recovering.count(soid));
   recovering.erase(soid);
-  missing_loc.remove_location(soid, from);
+  for (auto&& i : from)
+    missing_loc.remove_location(soid, i);
   dout(0) << __func__ << " " << soid << " from shard " << from
          << ", reps on " << missing_loc.get_locations(soid)
          << " unfound? " << missing_loc.is_unfound(soid) << dendl;
src/osd/ReplicatedPG.h
index 149d709c98f8af34dc20700eb72e9d6091d53768..484569e5ced3f05ae321320019808f0822e28fd0 100644 (file)
@@ -280,7 +280,7 @@ public:
   void on_global_recover(
     const hobject_t &oid,
     const object_stat_sum_t &stat_diff);
-  void failed_push(pg_shard_t from, const hobject_t &soid);
+  void failed_push(const list<pg_shard_t> &from, const hobject_t &soid) override;
   void cancel_pull(const hobject_t &soid);
 
   template <typename T>