git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: On EIO from read recover the primary replica from another copy
author David Zafman <dzafman@redhat.com>
Mon, 24 Apr 2017 23:54:06 +0000 (16:54 -0700)
committer David Zafman <dzafman@redhat.com>
Fri, 23 Jun 2017 15:02:51 +0000 (08:02 -0700)
Signed-off-by: David Zafman <dzafman@redhat.com>
src/osd/PG.cc
src/osd/PG.h
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogPG.h
src/osd/ReplicatedBackend.cc

index e1eb150d81e40ab48227a1a1a938d8ad2e10ac26..39b373fd478abf9815b23152185c1fdd0ddbfa85 100644 (file)
@@ -6853,7 +6853,7 @@ PG::RecoveryState::Clean::Clean(my_context ctx)
 
   pg->share_pg_info();
   pg->publish_stats_to_osd();
-
+  pg->requeue_ops(pg->waiting_for_clean_to_primary_repair);
 }
 
 void PG::RecoveryState::Clean::exit()
index 6213166d7407523e585caa63047959be7992fc0c..8b3fef6d39654920e62a22d6fb878d90ad8610bb 100644 (file)
@@ -256,6 +256,7 @@ protected:
   CephContext *cct;
   OSDriver osdriver;
   SnapMapper snap_mapper;
+  bool eio_errors_to_process = false;
 
   virtual PGBackend *get_pgbackend() = 0;
 public:
@@ -904,6 +905,7 @@ protected:
   list<OpRequestRef>            waiting_for_scrub;
 
   list<OpRequestRef>            waiting_for_cache_not_full;
+  list<OpRequestRef>            waiting_for_clean_to_primary_repair;
   map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
                             waiting_for_degraded_object,
                             waiting_for_blocked_object;
@@ -1855,6 +1857,7 @@ public:
     struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
       typedef boost::mpl::list<
        boost::statechart::transition< GoClean, Clean >,
+       boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
        boost::statechart::custom_reaction< AllReplicasActivated >
       > reactions;
       explicit Recovered(my_context ctx);
index 29354a83baddac44dbdf6852e97ab3f8c55f1280..710bcb54b71ac0d808541ec9f307937148b85446 100644 (file)
@@ -619,6 +619,15 @@ void PrimaryLogPG::block_write_on_full_cache(
   op->mark_delayed("waiting for cache not full");
 }
 
+void PrimaryLogPG::block_for_clean(
+  const hobject_t& oid, OpRequestRef op)
+{ /*
+   * Park op until this PG is clean again.
+   *
+   * rep_repair_primary_object() calls this when a read error arrives while
+   * is_clean() is false. PG::RecoveryState::Clean's constructor passes
+   * waiting_for_clean_to_primary_repair to requeue_ops().
+   *
+   * oid: object the failed read targeted; only used in the debug log line.
+   * op:  request to hold back.
+   */
+  dout(20) << __func__ << ": blocking object " << oid
+          << " on primary repair" << dendl;
+  waiting_for_clean_to_primary_repair.push_back(op);
+  op->mark_delayed("waiting for clean to repair");  // presumably records why the op is stalled -- confirm
+}
+
 void PrimaryLogPG::block_write_on_snap_rollback(
   const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
 {
@@ -4752,6 +4761,9 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
        } else {
          int r = pgbackend->objects_read_sync(
            soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
+         if (r == -EIO) {
+           r = rep_repair_primary_object(soid, ctx->op);
+         }
          if (r >= 0)
            op.extent.length = r;
          else {
@@ -4884,6 +4896,9 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
            bufferlist t;
            uint64_t len = miter->first - last;
            r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
+           if (r == -EIO) {
+             r = rep_repair_primary_object(soid, ctx->op);
+           }
            if (r < 0) {
              osd->clog->error() << coll << " " << soid
                                 << " sparse-read failed to read: "
@@ -10649,6 +10664,7 @@ void PrimaryLogPG::on_activate()
          RequestBackfill())));
   } else {
     dout(10) << "activate all replicas clean, no recovery" << dendl;
+    eio_errors_to_process = false;
     queue_peering_event(
       CephPeeringEvtRef(
        std::make_shared<CephPeeringEvt>(
@@ -11069,6 +11085,7 @@ bool PrimaryLogPG::start_recovery_ops(
             RequestBackfill())));
     } else {
       dout(10) << "recovery done, no backfill" << dendl;
+      eio_errors_to_process = false;
       queue_peering_event(
         CephPeeringEvtRef(
           std::make_shared<CephPeeringEvt>(
@@ -11079,6 +11096,7 @@ bool PrimaryLogPG::start_recovery_ops(
   } else { // backfilling
     state_clear(PG_STATE_BACKFILL);
     dout(10) << "recovery done, backfill done" << dendl;
+    eio_errors_to_process = false;
     queue_peering_event(
       CephPeeringEvtRef(
         std::make_shared<CephPeeringEvt>(
@@ -13820,6 +13838,77 @@ bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
     return osd->check_osdmap_full(missing_on);
 }
 
+int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef op)
+{ /*
+   * Start repair of the primary's copy of soid after a local read failed.
+   *
+   * do_osd_ops() calls this when objects_read_sync() returns -EIO. The
+   * object is recorded as missing on this OSD, every other acting shard is
+   * registered as a location for it, op is parked until the object is
+   * readable again, and a DoRecovery peering event is queued.
+   *
+   * soid: object whose read failed.
+   * op:   request that hit the error; on -EAGAIN it has been queued on
+   *       waiting_for_clean_to_primary_repair (PG not clean) or on
+   *       waiting_for_unreadable_object[soid] (repair started).
+   *
+   * Returns -EAGAIN when op was queued, -EIO when the acting set has no
+   * other shard or OI_ATTR cannot be decoded, or the negative result of
+   * objects_get_attr().
+   */
+  // Only supports replicated pools
+  assert(!pool.info.require_rollback());
+  assert(is_primary());
+
+  // Get non-primary shards
+  list<pg_shard_t> op_shards;
+  for (auto&& i : actingset) {
+    if (i == pg_whoami) continue; // Exclude self (primary)
+    op_shards.push_back(i);
+  }
+  if (op_shards.empty()) {  // nothing to recover from: hand the read error back
+    dout(0) << __func__ << " No other replicas available for " << soid << dendl;
+    return -EIO;
+  }
+
+  dout(10) << __func__ << " " << soid
+          << " peers osd.{" << op_shards << "}" << dendl;
+
+  if (!is_clean()) {  // defer: block_for_clean() parks op on waiting_for_clean_to_primary_repair
+    block_for_clean(soid, op);
+    return -EAGAIN;
+  }
+
+  assert(!pg_log.get_missing().is_missing(soid));  // must not already be recorded as missing here
+  bufferlist bv;
+  int r = get_pgbackend()->objects_get_attr(soid, OI_ATTR, &bv);  // the attr read itself may fail
+  if (r < 0)
+    return r;
+  object_info_t oi;
+  try {
+    bufferlist::iterator bliter = bv.begin();
+    ::decode(oi, bliter);
+  } catch (...) {  // any decode failure is reported as -EIO below
+    dout(0) << __func__ << ":  bad object_info_t: " << soid << dendl;
+    // XXX: Too bad I can't get the version to recover, so can't repair
+    return -EIO;
+  }
+
+  pg_log.missing_add(soid, oi.version, eversion_t());  // presumably (need, have): need oi.version, have nothing -- confirm
+
+  pg_log.set_last_requested(0);  // NOTE(review): looks like this restarts the recovery scan from the beginning -- confirm
+
+  missing_loc.add_missing(soid, oi.version, eversion_t());
+  for (auto &&i : op_shards)  // every other acting shard is offered as a source; none is checked here
+    missing_loc.add_location(soid, i);
+
+  // Restart the op after object becomes readable again
+  waiting_for_unreadable_object[soid].push_back(op);
+  op->mark_delayed("waiting for missing object");
+
+  if (!eio_errors_to_process) {  // first error since the flag was last reset: queue recovery once
+    eio_errors_to_process = true;
+    assert(is_clean());
+    queue_peering_event(
+        CephPeeringEvtRef(
+         std::make_shared<CephPeeringEvt>(
+         get_osdmap()->get_epoch(),
+         get_osdmap()->get_epoch(),
+         DoRecovery())));
+  } else {
+    // A prior error must have already cleared clean state and queued recovery
+    // or a map change has triggered re-peering.
+    // Not inlining the recovery by calling maybe_kick_recovery(soid);
+    dout(5) << __func__<< ": Read error on " << soid << ", but already seen errors" << dendl;
+  }
+
+  return -EAGAIN;  // op is queued above; it was not completed here
+}
+
 /*---SnapTrimmer Logging---*/
 #undef dout_prefix
 #define dout_prefix *_dout << pg->gen_prefix() 
index f6c1b97d8a3614a5b7ec41040413a3a82e222a0f..304e751e9d28c00d7a0a3c7a64dad7b5eac4687f 100644 (file)
@@ -1731,6 +1731,8 @@ public:
 
   void block_write_on_full_cache(
     const hobject_t& oid, OpRequestRef op);
+  void block_for_clean(
+    const hobject_t& oid, OpRequestRef op);
   void block_write_on_snap_rollback(
     const hobject_t& oid, ObjectContextRef obc, OpRequestRef op);
   void block_write_on_degraded_snap(const hobject_t& oid, OpRequestRef op);
@@ -1763,6 +1765,7 @@ public:
   void on_shutdown() override;
   bool check_failsafe_full(ostream &ss) override;
   bool check_osdmap_full(const set<pg_shard_t> &missing_on) override;
+  int rep_repair_primary_object(const hobject_t& soid, OpRequestRef op);
 
   // attr cache handling
   void setattr_maybe_cache(
index 6a63f5415f1ceb41fd14224cc57007d2a4d10c96..8ef6d5db90a697425b77faefd989ea106db03075 100644 (file)
@@ -1592,6 +1592,7 @@ void ReplicatedBackend::prep_push(
                        &new_progress,
                        pop,
                        &(pi.stat), cache_dont_need);
+  // XXX: What can we do here?
   assert(r == 0);
   pi.recovery_progress = new_progress;
 }
@@ -2007,9 +2008,11 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
        p != out_op->data_included.end();
        ++p) {
     bufferlist bit;
-    store->read(ch, ghobject_t(recovery_info.soid),
+    int r = store->read(ch, ghobject_t(recovery_info.soid),
                p.get_start(), p.get_len(), bit,
                 cache_dont_need ? CEPH_OSD_OP_FLAG_FADVISE_DONTNEED: 0);
+    if (r < 0)
+      return r;
     if (p.get_len() != bit.length()) {
       dout(10) << " extent " << p.get_start() << "~" << p.get_len()
               << " is actually " << p.get_start() << "~" << bit.length()
@@ -2086,6 +2089,7 @@ bool ReplicatedBackend::handle_push_reply(
        pi->recovery_info,
        pi->recovery_progress, &new_progress, reply,
        &(pi->stat));
+      // XXX: What can we do here?
       assert(r == 0);
       pi->recovery_progress = new_progress;
       return true;