From: David Zafman Date: Fri, 28 Apr 2017 00:19:41 +0000 (-0700) Subject: osd: Handle read failures when pushing at primary X-Git-Tag: ses5-milestone8~1^2~19^2~12 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=818553584645cbbedb123945ee9dfa428c76062d;p=ceph.git osd: Handle read failures when pushing at primary Signed-off-by: David Zafman --- diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index c35bf1ae21c8..7097dda482d8 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -689,7 +689,7 @@ void ECBackend::run_recovery_op( delete _h; } -void ECBackend::recover_object( +int ECBackend::recover_object( const hobject_t &hoid, eversion_t v, ObjectContextRef head, @@ -730,6 +730,7 @@ void ECBackend::recover_object( } } dout(10) << __func__ << ": built op " << h->ops.back() << dendl; + return 0; } bool ECBackend::can_handle_while_inactive( diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index f659675d9ced..8a9c5a30c015 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -41,7 +41,7 @@ public: int priority ) override; - void recover_object( + int recover_object( const hobject_t &hoid, eversion_t v, ObjectContextRef head, diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index b50f0d8c78cb..323d411dc610 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -331,7 +331,7 @@ typedef ceph::shared_ptr OSDMapRef; * @param missing [in] set of info, missing pairs for queried nodes * @param overlaps [in] mapping of object to file offset overlaps */ - virtual void recover_object( + virtual int recover_object( const hobject_t &hoid, ///< [in] object to recover eversion_t v, ///< [in] version to recover ObjectContextRef head, ///< [in] context of the head/snapdir object diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 710bcb54b71a..81aa7a86182a 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -10104,12 +10104,14 @@ int PrimaryLogPG::recover_missing( start_recovery_op(soid); assert(!recovering.count(soid)); recovering.insert(make_pair(soid, obc)); - pgbackend->recover_object( + int r = pgbackend->recover_object( soid, v, head_obc, obc, h); + // This is only a pull which shouldn't return an error + assert(r >= 0); return PULL_YES; } @@ -11267,6 +11269,34 @@ uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handl return started; } +bool PrimaryLogPG::primary_error( + const hobject_t& soid, eversion_t v) +{ + pg_log.missing_add(soid, v, eversion_t()); + pg_log.set_last_requested(0); + missing_loc.remove_location(soid, pg_whoami); + bool uhoh = true; + assert(!actingbackfill.empty()); + for (set::iterator i = actingbackfill.begin(); + i != actingbackfill.end(); + ++i) { + if (*i == get_primary()) continue; + pg_shard_t peer = *i; + if (!peer_missing[peer].is_missing(soid, v)) { + missing_loc.add_location(soid, peer); + dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v + << ", there should be a copy on shard " << peer << dendl; + uhoh = false; + } + } + if (uhoh) + osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound"; + else + osd->clog->error() << info.pgid << " missing primary copy of " << soid + << ", will try copies on " << missing_loc.get_locations(soid); + return uhoh; +} + int PrimaryLogPG::prep_object_replica_pushes( const hobject_t& soid, eversion_t v, PGBackend::RecoveryHandle *h) @@ -11277,27 +11307,7 @@ int PrimaryLogPG::prep_object_replica_pushes( // NOTE: we know we will get a valid oloc off of disk here. ObjectContextRef obc = get_object_context(soid, false); if (!obc) { - pg_log.missing_add(soid, v, eversion_t()); - missing_loc.remove_location(soid, pg_whoami); - bool uhoh = true; - assert(!actingbackfill.empty()); - for (set::iterator i = actingbackfill.begin(); - i != actingbackfill.end(); - ++i) { - if (*i == get_primary()) continue; - pg_shard_t peer = *i; - if (!peer_missing[peer].is_missing(soid, v)) { - missing_loc.add_location(soid, peer); - dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v - << ", there should be a copy on shard " << peer << dendl; - uhoh = false; - } - } - if (uhoh) - osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound"; - else - osd->clog->error() << info.pgid << " missing primary copy of " << soid - << ", will try copies on " << missing_loc.get_locations(soid); + primary_error(soid, v); return 0; } @@ -11320,13 +11330,20 @@ int PrimaryLogPG::prep_object_replica_pushes( * In almost all cases, therefore, this lock should be uncontended. */ obc->ondisk_read_lock(); - pgbackend->recover_object( + int r = pgbackend->recover_object( soid, v, ObjectContextRef(), obc, // has snapset context h); obc->ondisk_read_unlock(); + if (r < 0) { + dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl; + list fl = { pg_whoami }; + failed_push(fl, soid); + primary_error(soid, v); + return 0; + } return 1; } @@ -11889,7 +11906,7 @@ uint64_t PrimaryLogPG::recover_backfill( return ops; } -void PrimaryLogPG::prep_backfill_object_push( +int PrimaryLogPG::prep_backfill_object_push( hobject_t oid, eversion_t v, ObjectContextRef obc, vector peers, @@ -11912,13 +11929,14 @@ void PrimaryLogPG::prep_backfill_object_push( // We need to take the read_lock here in order to flush in-progress writes obc->ondisk_read_lock(); - pgbackend->recover_object( + int r = pgbackend->recover_object( oid, v, ObjectContextRef(), obc, h); obc->ondisk_read_unlock(); + return r; } void PrimaryLogPG::update_range( @@ -13844,19 +13862,8 @@ int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef assert(!pool.info.require_rollback()); assert(is_primary()); - // Get non-primary shards - list op_shards; - for (auto&& i : actingset) { - if (i == pg_whoami) continue; // Exclude self (primary) - op_shards.push_back(i); - } - if (op_shards.empty()) { - dout(0) << __func__ << " No other replicas available for " << soid << dendl; - return -EIO; - } - dout(10) << __func__ << " " << soid - << " peers osd.{" << op_shards << "}" << dendl; + << " peers osd.{" << actingbackfill << "}" << dendl; if (!is_clean()) { block_for_clean(soid, op); @@ -13878,13 +13885,16 @@ int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef return -EIO; } - pg_log.missing_add(soid, oi.version, eversion_t()); - - pg_log.set_last_requested(0); - missing_loc.add_missing(soid, oi.version, eversion_t()); - for (auto &&i : op_shards) - missing_loc.add_location(soid, i); + if (primary_error(soid, oi.version)) { + dout(0) << __func__ << " No other replicas available for " << soid << dendl; + // XXX: If we knew that there is no down osd which could include this + // object, it would be nice if we could return EIO here. + // If a "never fail" flag was available, that could be used + // for rbd to NOT return EIO until object marked lost. + + // Drop through to save this op in case an osd comes up with the object. + } // Restart the op after object becomes readable again waiting_for_unreadable_object[soid].push_back(op); diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index 304e751e9d28..4ef55f035b19 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -263,6 +263,7 @@ public: const hobject_t &oid, const object_stat_sum_t &stat_diff) override; void failed_push(const list &from, const hobject_t &soid) override; + bool primary_error(const hobject_t& soid, eversion_t v) override; void cancel_pull(const hobject_t &soid) override; void apply_stats( const hobject_t &soid, @@ -1068,6 +1069,7 @@ protected: hobject_t last_backfill_started; bool new_backfill; + void primary_error(const hobject_t& soid, eversion_t v); int prep_object_replica_pushes(const hobject_t& soid, eversion_t v, PGBackend::RecoveryHandle *h); @@ -1221,7 +1223,7 @@ protected: ThreadPool::TPHandle &handle ///< [in] tp handle ); - void prep_backfill_object_push( + int prep_backfill_object_push( hobject_t oid, eversion_t v, ObjectContextRef obc, vector peers, PGBackend::RecoveryHandle *h); diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index 8ef6d5db90a6..3fa1ce38ba9c 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -127,7 +127,7 @@ void ReplicatedBackend::run_recovery_op( delete h; } -void ReplicatedBackend::recover_object( +int ReplicatedBackend::recover_object( const hobject_t &hoid, eversion_t v, ObjectContextRef head, @@ -145,15 +145,18 @@ void ReplicatedBackend::recover_object( hoid, head, h); - return; } else { assert(obc); int started = start_pushes( hoid, obc, h); - assert(started > 0); + if (started < 0) { + pushing[hoid].clear(); + return started; + } } + return 0; } void ReplicatedBackend::check_recovery_sources(const OSDMapRef& osdmap) @@ -855,7 +858,10 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext { assert(j != bc->pulling.end()); ObjectContextRef obc = j->second.obc; bc->clear_pull(j, false /* already did it */); - if (!bc->start_pushes(i.hoid, obc, h)) { + int started = bc->start_pushes(i.hoid, obc, h); + // XXX: Handle errors here? + assert(started >= 0); + if (!started) { bc->get_parent()->on_global_recover( i.hoid, i.stat); } @@ -1474,7 +1480,7 @@ void ReplicatedBackend::prepare_pull( * intelligently push an object to a replica. make use of existing * clones/heads and dup data ranges where possible. */ -void ReplicatedBackend::prep_push_to_replica( +int ReplicatedBackend::prep_push_to_replica( ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer, PushOp *pop, bool cache_dont_need) { @@ -1537,7 +1543,7 @@ void ReplicatedBackend::prep_push_to_replica( lock_manager); } - prep_push( + return prep_push( obc, soid, peer, @@ -1549,7 +1555,7 @@ void ReplicatedBackend::prep_push_to_replica( std::move(lock_manager)); } -void ReplicatedBackend::prep_push(ObjectContextRef obc, +int ReplicatedBackend::prep_push(ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer, PushOp *pop, bool cache_dont_need) { @@ -1558,12 +1564,12 @@ void ReplicatedBackend::prep_push(ObjectContextRef obc, data_subset.insert(0, obc->obs.oi.size); map> clone_subsets; - prep_push(obc, soid, peer, + return prep_push(obc, soid, peer, obc->obs.oi.version, data_subset, clone_subsets, pop, cache_dont_need, ObcLockManager()); } -void ReplicatedBackend::prep_push( +int ReplicatedBackend::prep_push( ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer, eversion_t version, @@ -1592,9 +1598,10 @@ void ReplicatedBackend::prep_push( &new_progress, pop, &(pi.stat), cache_dont_need); - // XXX: What can we do here? - assert(r == 0); + if (r < 0) + return r; pi.recovery_progress = new_progress; + return 0; } void ReplicatedBackend::submit_push_data( @@ -2011,8 +2018,9 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info, int r = store->read(ch, ghobject_t(recovery_info.soid), p.get_start(), p.get_len(), bit, cache_dont_need ? CEPH_OSD_OP_FLAG_FADVISE_DONTNEED: 0); - if (r < 0) + if (r < 0) { return r; + } if (p.get_len() != bit.length()) { dout(10) << " extent " << p.get_start() << "~" << p.get_len() << " is actually " << p.get_start() << "~" << bit.length() @@ -2236,8 +2244,14 @@ int ReplicatedBackend::start_pushes( if (j->second.is_missing(soid)) { ++pushes; h->pushes[peer].push_back(PushOp()); - prep_push_to_replica(obc, soid, peer, + int r = prep_push_to_replica(obc, soid, peer, &(h->pushes[peer].back()), h->cache_dont_need); + if (r < 0) { + // prep_push_to_replica() should fail on first attempt or not at all + assert(pushes == 1); + h->pushes[peer].pop_back(); + return r; + } } } return pushes; diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h index 0b949cac64a8..c2a9282bef41 100644 --- a/src/osd/ReplicatedBackend.h +++ b/src/osd/ReplicatedBackend.h @@ -48,7 +48,7 @@ public: int priority) override; /// @see PGBackend::recover_object - void recover_object( + int recover_object( const hobject_t &hoid, eversion_t v, ObjectContextRef head, @@ -297,15 +297,15 @@ private: const hobject_t &soid, ObjectContextRef obj, RPGHandle *h); - void prep_push_to_replica( + int prep_push_to_replica( ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer, PushOp *pop, bool cache_dont_need = true); - void prep_push( + int prep_push( ObjectContextRef obc, const hobject_t& oid, pg_shard_t dest, PushOp *op, bool cache_dont_need); - void prep_push( + int prep_push( ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer, eversion_t version,