]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Handle read failures when pushing at primary
authorDavid Zafman <dzafman@redhat.com>
Fri, 28 Apr 2017 00:19:41 +0000 (17:19 -0700)
committerDavid Zafman <dzafman@redhat.com>
Fri, 23 Jun 2017 15:02:51 +0000 (08:02 -0700)
Signed-off-by: David Zafman <dzafman@redhat.com>
src/osd/ECBackend.cc
src/osd/ECBackend.h
src/osd/PGBackend.h
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogPG.h
src/osd/ReplicatedBackend.cc
src/osd/ReplicatedBackend.h

index c35bf1ae21c8d3ee61eafbe60750a9343ff60e33..7097dda482d812a7f6a17598ff9ad9e46e39c02c 100644 (file)
@@ -689,7 +689,7 @@ void ECBackend::run_recovery_op(
   delete _h;
 }
 
-void ECBackend::recover_object(
+int ECBackend::recover_object(
   const hobject_t &hoid,
   eversion_t v,
   ObjectContextRef head,
@@ -730,6 +730,7 @@ void ECBackend::recover_object(
     }
   }
   dout(10) << __func__ << ": built op " << h->ops.back() << dendl;
+  return 0;
 }
 
 bool ECBackend::can_handle_while_inactive(
index f659675d9cedb81232e7c1a9b33f8e7e1ee9fdc8..8a9c5a30c01575ad0d7aed7c825e2d3c8a762027 100644 (file)
@@ -41,7 +41,7 @@ public:
     int priority
     ) override;
 
-  void recover_object(
+  int recover_object(
     const hobject_t &hoid,
     eversion_t v,
     ObjectContextRef head,
index b50f0d8c78cbdab2db7bbaa835b13fd13571b6cf..323d411dc610476e1c22fd222857d5f6ee36e513 100644 (file)
@@ -331,7 +331,7 @@ typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
     * @param missing [in] set of info, missing pairs for queried nodes
     * @param overlaps [in] mapping of object to file offset overlaps
     */
-   virtual void recover_object(
+   virtual int recover_object(
      const hobject_t &hoid, ///< [in] object to recover
      eversion_t v,          ///< [in] version to recover
      ObjectContextRef head,  ///< [in] context of the head/snapdir object
index 710bcb54b71ac0d808541ec9f307937148b85446..81aa7a86182a2b8872f2131b5dbd4c30a59a09ae 100644 (file)
@@ -10104,12 +10104,14 @@ int PrimaryLogPG::recover_missing(
   start_recovery_op(soid);
   assert(!recovering.count(soid));
   recovering.insert(make_pair(soid, obc));
-  pgbackend->recover_object(
+  int r = pgbackend->recover_object(
     soid,
     v,
     head_obc,
     obc,
     h);
+  // This is only a pull which shouldn't return an error
+  assert(r >= 0);
   return PULL_YES;
 }
 
@@ -11267,6 +11269,34 @@ uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handl
   return started;
 }
 
+bool PrimaryLogPG::primary_error(
+  const hobject_t& soid, eversion_t v)
+{
+  pg_log.missing_add(soid, v, eversion_t());
+  pg_log.set_last_requested(0);
+  missing_loc.remove_location(soid, pg_whoami);
+  bool uhoh = true;
+  assert(!actingbackfill.empty());
+  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
+       i != actingbackfill.end();
+       ++i) {
+    if (*i == get_primary()) continue;
+    pg_shard_t peer = *i;
+    if (!peer_missing[peer].is_missing(soid, v)) {
+      missing_loc.add_location(soid, peer);
+      dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
+              << ", there should be a copy on shard " << peer << dendl;
+      uhoh = false;
+    }
+  }
+  if (uhoh)
+    osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
+  else
+    osd->clog->error() << info.pgid << " missing primary copy of " << soid
+                        << ", will try copies on " << missing_loc.get_locations(soid);
+  return uhoh;
+}
+
 int PrimaryLogPG::prep_object_replica_pushes(
   const hobject_t& soid, eversion_t v,
   PGBackend::RecoveryHandle *h)
@@ -11277,27 +11307,7 @@ int PrimaryLogPG::prep_object_replica_pushes(
   // NOTE: we know we will get a valid oloc off of disk here.
   ObjectContextRef obc = get_object_context(soid, false);
   if (!obc) {
-    pg_log.missing_add(soid, v, eversion_t());
-    missing_loc.remove_location(soid, pg_whoami);
-    bool uhoh = true;
-    assert(!actingbackfill.empty());
-    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
-        i != actingbackfill.end();
-        ++i) {
-      if (*i == get_primary()) continue;
-      pg_shard_t peer = *i;
-      if (!peer_missing[peer].is_missing(soid, v)) {
-       missing_loc.add_location(soid, peer);
-       dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
-                << ", there should be a copy on shard " << peer << dendl;
-       uhoh = false;
-      }
-    }
-    if (uhoh)
-      osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
-    else
-      osd->clog->error() << info.pgid << " missing primary copy of " << soid
-                        << ", will try copies on " << missing_loc.get_locations(soid);
+    primary_error(soid, v);
     return 0;
   }
 
@@ -11320,13 +11330,20 @@ int PrimaryLogPG::prep_object_replica_pushes(
    * In almost all cases, therefore, this lock should be uncontended.
    */
   obc->ondisk_read_lock();
-  pgbackend->recover_object(
+  int r = pgbackend->recover_object(
     soid,
     v,
     ObjectContextRef(),
     obc, // has snapset context
     h);
   obc->ondisk_read_unlock();
+  if (r < 0) {
+    dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
+    list<pg_shard_t> fl = { pg_whoami };
+    failed_push(fl, soid);
+    primary_error(soid, v);
+    return 0;
+  }
   return 1;
 }
 
@@ -11889,7 +11906,7 @@ uint64_t PrimaryLogPG::recover_backfill(
   return ops;
 }
 
-void PrimaryLogPG::prep_backfill_object_push(
+int PrimaryLogPG::prep_backfill_object_push(
   hobject_t oid, eversion_t v,
   ObjectContextRef obc,
   vector<pg_shard_t> peers,
@@ -11912,13 +11929,14 @@ void PrimaryLogPG::prep_backfill_object_push(
 
   // We need to take the read_lock here in order to flush in-progress writes
   obc->ondisk_read_lock();
-  pgbackend->recover_object(
+  int r = pgbackend->recover_object(
     oid,
     v,
     ObjectContextRef(),
     obc,
     h);
   obc->ondisk_read_unlock();
+  return r;
 }
 
 void PrimaryLogPG::update_range(
@@ -13844,19 +13862,8 @@ int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef
   assert(!pool.info.require_rollback());
   assert(is_primary());
 
-  // Get non-primary shards
-  list<pg_shard_t> op_shards;
-  for (auto&& i : actingset) {
-    if (i == pg_whoami) continue; // Exclude self (primary)
-    op_shards.push_back(i);
-  }
-  if (op_shards.empty()) {
-    dout(0) << __func__ << " No other replicas available for " << soid << dendl;
-    return -EIO;
-  }
-
   dout(10) << __func__ << " " << soid
-          << " peers osd.{" << op_shards << "}" << dendl;
+          << " peers osd.{" << actingbackfill << "}" << dendl;
 
   if (!is_clean()) {
     block_for_clean(soid, op);
@@ -13878,13 +13885,16 @@ int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef
     return -EIO;
   }
 
-  pg_log.missing_add(soid, oi.version, eversion_t());
-
-  pg_log.set_last_requested(0);
-
   missing_loc.add_missing(soid, oi.version, eversion_t());
-  for (auto &&i : op_shards)
-    missing_loc.add_location(soid, i);
+  if (primary_error(soid, oi.version)) {
+    dout(0) << __func__ << " No other replicas available for " << soid << dendl;
+    // XXX: If we knew that there is no down osd which could include this
+    // object, it would be nice if we could return EIO here.
+    // If a "never fail" flag was available, that could be used
+    // for rbd to NOT return EIO until object marked lost.
+
+    // Drop through to save this op in case an osd comes up with the object.
+  }
 
   // Restart the op after object becomes readable again
   waiting_for_unreadable_object[soid].push_back(op);
index 304e751e9d28c00d7a0a3c7a64dad7b5eac4687f..4ef55f035b19b67ab59192c998bbe37fee267bf5 100644 (file)
@@ -263,6 +263,7 @@ public:
     const hobject_t &oid,
     const object_stat_sum_t &stat_diff) override;
   void failed_push(const list<pg_shard_t> &from, const hobject_t &soid) override;
+  bool primary_error(const hobject_t& soid, eversion_t v) override;
   void cancel_pull(const hobject_t &soid) override;
   void apply_stats(
     const hobject_t &soid,
@@ -1068,6 +1069,7 @@ protected:
   hobject_t last_backfill_started;
   bool new_backfill;
 
+  void primary_error(const hobject_t& soid, eversion_t v);
   int prep_object_replica_pushes(const hobject_t& soid, eversion_t v,
                                 PGBackend::RecoveryHandle *h);
 
@@ -1221,7 +1223,7 @@ protected:
     ThreadPool::TPHandle &handle ///< [in] tp handle
     );
 
-  void prep_backfill_object_push(
+  int prep_backfill_object_push(
     hobject_t oid, eversion_t v, ObjectContextRef obc,
     vector<pg_shard_t> peers,
     PGBackend::RecoveryHandle *h);
index 8ef6d5db90a697425b77faefd989ea106db03075..3fa1ce38ba9ccdd8e0f71b159f68e3a0c00a4702 100644 (file)
@@ -127,7 +127,7 @@ void ReplicatedBackend::run_recovery_op(
   delete h;
 }
 
-void ReplicatedBackend::recover_object(
+int ReplicatedBackend::recover_object(
   const hobject_t &hoid,
   eversion_t v,
   ObjectContextRef head,
@@ -145,15 +145,18 @@ void ReplicatedBackend::recover_object(
       hoid,
       head,
       h);
-    return;
   } else {
     assert(obc);
     int started = start_pushes(
       hoid,
       obc,
       h);
-    assert(started > 0);
+    if (started < 0) {
+      pushing[hoid].clear();
+      return started;
+    }
   }
+  return 0;
 }
 
 void ReplicatedBackend::check_recovery_sources(const OSDMapRef& osdmap)
@@ -855,7 +858,10 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
       assert(j != bc->pulling.end());
       ObjectContextRef obc = j->second.obc;
       bc->clear_pull(j, false /* already did it */);
-      if (!bc->start_pushes(i.hoid, obc, h)) {
+      int started = bc->start_pushes(i.hoid, obc, h);
+      // XXX: Handle errors here?
+      assert(started >= 0);
+      if (!started) {
        bc->get_parent()->on_global_recover(
          i.hoid, i.stat);
       }
@@ -1474,7 +1480,7 @@ void ReplicatedBackend::prepare_pull(
  * intelligently push an object to a replica.  make use of existing
  * clones/heads and dup data ranges where possible.
  */
-void ReplicatedBackend::prep_push_to_replica(
+int ReplicatedBackend::prep_push_to_replica(
   ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
   PushOp *pop, bool cache_dont_need)
 {
@@ -1537,7 +1543,7 @@ void ReplicatedBackend::prep_push_to_replica(
       lock_manager);
   }
 
-  prep_push(
+  return prep_push(
     obc,
     soid,
     peer,
@@ -1549,7 +1555,7 @@ void ReplicatedBackend::prep_push_to_replica(
     std::move(lock_manager));
 }
 
-void ReplicatedBackend::prep_push(ObjectContextRef obc,
+int ReplicatedBackend::prep_push(ObjectContextRef obc,
                             const hobject_t& soid, pg_shard_t peer,
                             PushOp *pop, bool cache_dont_need)
 {
@@ -1558,12 +1564,12 @@ void ReplicatedBackend::prep_push(ObjectContextRef obc,
     data_subset.insert(0, obc->obs.oi.size);
   map<hobject_t, interval_set<uint64_t>> clone_subsets;
 
-  prep_push(obc, soid, peer,
+  return prep_push(obc, soid, peer,
            obc->obs.oi.version, data_subset, clone_subsets,
            pop, cache_dont_need, ObcLockManager());
 }
 
-void ReplicatedBackend::prep_push(
+int ReplicatedBackend::prep_push(
   ObjectContextRef obc,
   const hobject_t& soid, pg_shard_t peer,
   eversion_t version,
@@ -1592,9 +1598,10 @@ void ReplicatedBackend::prep_push(
                        &new_progress,
                        pop,
                        &(pi.stat), cache_dont_need);
-  // XXX: What can we do here?
-  assert(r == 0);
+  if (r < 0)
+    return r;
   pi.recovery_progress = new_progress;
+  return 0;
 }
 
 void ReplicatedBackend::submit_push_data(
@@ -2011,8 +2018,9 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
     int r = store->read(ch, ghobject_t(recovery_info.soid),
                p.get_start(), p.get_len(), bit,
                 cache_dont_need ? CEPH_OSD_OP_FLAG_FADVISE_DONTNEED: 0);
-    if (r < 0)
+    if (r < 0) {
       return r;
+    }
     if (p.get_len() != bit.length()) {
       dout(10) << " extent " << p.get_start() << "~" << p.get_len()
               << " is actually " << p.get_start() << "~" << bit.length()
@@ -2236,8 +2244,14 @@ int ReplicatedBackend::start_pushes(
     if (j->second.is_missing(soid)) {
       ++pushes;
       h->pushes[peer].push_back(PushOp());
-      prep_push_to_replica(obc, soid, peer,
+      int r = prep_push_to_replica(obc, soid, peer,
                           &(h->pushes[peer].back()), h->cache_dont_need);
+      if (r < 0) {
+       // prep_push_to_replica() should fail on first attempt or not at all
+       assert(pushes == 1);
+       h->pushes[peer].pop_back();
+       return r;
+      }
     }
   }
   return pushes;
index 0b949cac64a8a19046fbd2f22ed2514c69c45acb..c2a9282bef4100f5ce4257baeb9f5e04f1442ac4 100644 (file)
@@ -48,7 +48,7 @@ public:
     int priority) override;
 
   /// @see PGBackend::recover_object
-  void recover_object(
+  int recover_object(
     const hobject_t &hoid,
     eversion_t v,
     ObjectContextRef head,
@@ -297,15 +297,15 @@ private:
     const hobject_t &soid,
     ObjectContextRef obj,
     RPGHandle *h);
-  void prep_push_to_replica(
+  int prep_push_to_replica(
     ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
     PushOp *pop, bool cache_dont_need = true);
-  void prep_push(
+  int prep_push(
     ObjectContextRef obc,
     const hobject_t& oid, pg_shard_t dest,
     PushOp *op,
     bool cache_dont_need);
-  void prep_push(
+  int prep_push(
     ObjectContextRef obc,
     const hobject_t& soid, pg_shard_t peer,
     eversion_t version,