From: David Zafman Date: Mon, 1 May 2017 16:13:16 +0000 (-0700) Subject: osd: Handle read errors during backfill X-Git-Tag: ses5-milestone8~1^2~19^2~11 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=346ec3fb6154d620ad53f06550e9ab20c3b47b3a;p=ceph.git osd: Handle read errors during backfill Signed-off-by: David Zafman --- diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 81aa7a86182a..f8a4e910b964 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -11378,8 +11378,14 @@ uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &hand handle.reset_tp_timeout(); const hobject_t soid(p->second); + if (missing_loc.is_unfound(soid)) { + dout(10) << __func__ << ": " << soid << " still unfound" << dendl; + continue; + } + if (soid > pi->second.last_backfill) { if (!recovering.count(soid)) { + derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl; derr << __func__ << ": object added to missing set for backfill, but " << "is not in recovering, error!" 
<< dendl; ceph_abort(); @@ -11392,11 +11398,6 @@ uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &hand continue; } - if (missing_loc.is_unfound(soid)) { - dout(10) << __func__ << ": " << soid << " still unfound" << dendl; - continue; - } - if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) { dout(10) << __func__ << ": " << soid.get_head() << " still missing on primary" << dendl; @@ -11534,8 +11535,6 @@ uint64_t PrimaryLogPG::recover_backfill( update_range(&backfill_info, handle); unsigned ops = 0; - vector<boost::tuple<hobject_t, eversion_t, - ObjectContextRef, vector<pg_shard_t> > > to_push; vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove; set<hobject_t> add_to_stat; @@ -11547,6 +11546,7 @@ uint64_t PrimaryLogPG::recover_backfill( } backfill_info.trim_to(last_backfill_started); + PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); while (ops < max) { if (backfill_info.begin <= earliest_peer_backfill() && !backfill_info.extends_to_end() && backfill_info.empty()) { @@ -11726,10 +11726,13 @@ uint64_t PrimaryLogPG::recover_backfill( vector<pg_shard_t> all_push = need_ver_targs; all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end()); - to_push.push_back( - boost::tuple<hobject_t, eversion_t, ObjectContextRef, vector<pg_shard_t> > - (backfill_info.begin, obj_v, obc, all_push)); - // Count all simultaneous pushes of the same object as a single op + handle.reset_tp_timeout(); + int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h); + if (r < 0) { + *work_started = true; + dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl; + break; + } ops++; } else { *work_started = true; @@ -11810,12 +11813,6 @@ uint64_t PrimaryLogPG::recover_backfill( } } - PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op(); - for (unsigned i = 0; i < to_push.size(); ++i) { - handle.reset_tp_timeout(); - prep_backfill_object_push(to_push[i].get<0>(), to_push[i].get<1>(), - to_push[i].get<2>(), to_push[i].get<3>(), h); - } pgbackend->run_recovery_op(h, get_recovery_op_priority()); dout(5) << "backfill_pos is " << backfill_pos << 
dendl; @@ -11912,7 +11909,7 @@ int PrimaryLogPG::prep_backfill_object_push( vector<pg_shard_t> peers, PGBackend::RecoveryHandle *h) { - dout(10) << "push_backfill_object " << oid << " v " << v << " to peers " << peers << dendl; + dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl; assert(!peers.empty()); backfills_in_flight.insert(oid); @@ -11936,6 +11933,14 @@ int PrimaryLogPG::prep_backfill_object_push( obc, h); obc->ondisk_read_unlock(); + if (r < 0) { + dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl; + list<pg_shard_t> fl = { pg_whoami }; + failed_push(fl, oid); + primary_error(oid, v); + backfills_in_flight.erase(oid); + missing_loc.add_missing(oid, v, eversion_t()); + } return r; } diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index 3fa1ce38ba9c..9bb7abc34bee 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -2224,6 +2224,9 @@ void ReplicatedBackend::clear_pull( pulling.erase(piter); } +// This can read the local replica multiple times. This +// isn't so bad as long as the ObjectStore caches and +// h->cache_dont_need is NOT true. int ReplicatedBackend::start_pushes( const hobject_t &soid, ObjectContextRef obc,