From 68defc2b0561414711d4dd0a76bc5d0f46f8a3f8 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 23 Nov 2016 15:41:13 -0800 Subject: [PATCH] ReplicatedBackend: take read locks for clone sources during recovery Otherwise, we run the risk of a clone source which hasn't actually come into existence yet being used if we grab a clone which *just* got added the the ssc, but has not yet actually had time to be created (can't rely on message ordering here since recovery messages don't necessarily order with client IO!). Fixes: http://tracker.ceph.com/issues/17831 Signed-off-by: Samuel Just --- src/osd/PGBackend.h | 6 ++ src/osd/PrimaryLogPG.h | 16 +++- src/osd/ReplicatedBackend.cc | 142 +++++++++++++++++++++++------------ src/osd/ReplicatedBackend.h | 59 ++++++++++----- src/osd/osd_types.h | 21 ++++++ 5 files changed, 175 insertions(+), 69 deletions(-) diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index d4e0091ca735b..46d66e7194edf 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -186,6 +186,12 @@ typedef ceph::shared_ptr OSDMapRef; const hobject_t &hoid, map &attrs) = 0; + virtual bool try_lock_for_read( + const hobject_t &hoid, + ObcLockManager &manager) = 0; + + virtual void release_locks(ObcLockManager &manager) = 0; + virtual void op_applied( const eversion_t &applied_version) = 0; diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index 019e871e12d7e..114a21c662551 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -335,11 +335,26 @@ public: const pg_pool_t &get_pool() const override { return pool.info; } + ObjectContextRef get_obc( const hobject_t &hoid, map &attrs) override { return get_object_context(hoid, true, &attrs); } + + bool try_lock_for_read( + const hobject_t &hoid, + ObcLockManager &manager) override { + auto obc = get_object_context(hoid, false, nullptr); + if (!obc) + return false; + return manager.try_get_read_lock(hoid, obc); + } + + void release_locks(ObcLockManager &manager) { + release_object_locks(manager); + } + void pgb_set_object_snap_mapping( const hobject_t &soid, const set &snaps, @@ -352,7 +367,6 @@ public: return clear_object_snap_mapping(t, soid); } - void log_operation( const vector &logv, boost::optional &hset_history, diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index c051db8828efa..070bbbf0150ef 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -166,9 +166,8 @@ void ReplicatedBackend::check_recovery_sources(const OSDMapRef& osdmap) for (set::iterator j = i->second.begin(); j != i->second.end(); ++j) { - assert(pulling.count(*j) == 1); get_parent()->cancel_pull(*j); - pulling.erase(*j); + clear_pull(pulling.find(*j), false); } pull_from_peer.erase(i++); } else { @@ -274,7 +273,16 @@ bool ReplicatedBackend::handle_message( void ReplicatedBackend::clear_recovery_state() { // clear pushing/pulling maps + for (auto &&i: pushing) { + for (auto &&j: i.second) { + get_parent()->release_locks(j.second.lock_manager); + } + } pushing.clear(); + + for (auto &&i: pulling) { + get_parent()->release_locks(i.second.lock_manager); + } pulling.clear(); pull_from_peer.clear(); } @@ -859,25 +867,18 @@ void ReplicatedBackend::_do_push(OpRequestRef op) struct C_ReplicatedBackend_OnPullComplete : GenContext { ReplicatedBackend *bc; - list to_continue; + list to_continue; int priority; C_ReplicatedBackend_OnPullComplete(ReplicatedBackend *bc, int priority) : bc(bc), priority(priority) {} void finish(ThreadPool::TPHandle &handle) { ReplicatedBackend::RPGHandle *h = bc->_open_recovery_op(); - for (list::iterator i = - to_continue.begin(); - i != to_continue.end(); - ++i) { - map::iterator j = - bc->pulling.find(*i); - assert(j != bc->pulling.end()); - if (!bc->start_pushes(*i, j->second.obc, h)) { + for (auto &&i: to_continue) { + if (!bc->start_pushes(i.hoid, i.obc, h)) { bc->get_parent()->on_global_recover( - *i, j->second.stat); + i.hoid, i.stat); } - bc->pulling.erase(*i); handle.reset_tp_timeout(); } bc->run_recovery_op(h, priority); @@ -894,7 +895,7 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op) vector replies(1); ObjectStore::Transaction t; - list to_continue; + list to_continue; for (vector::iterator i = m->pushes.begin(); i != m->pushes.end(); ++i) { @@ -1255,7 +1256,8 @@ void ReplicatedBackend::calc_head_subsets( const pg_missing_t& missing, const hobject_t &last_backfill, interval_set& data_subset, - map, hobject_t::BitwiseComparator>& clone_subsets) + map, hobject_t::BitwiseComparator>& clone_subsets, + ObcLockManager &manager) { dout(10) << "calc_head_subsets " << head << " clone_overlap " << snapset.clone_overlap << dendl; @@ -1285,7 +1287,8 @@ void ReplicatedBackend::calc_head_subsets( c.snap = snapset.clones[j]; prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]); if (!missing.is_missing(c) && - cmp(c, last_backfill, get_parent()->sort_bitwise()) < 0) { + cmp(c, last_backfill, get_parent()->sort_bitwise()) < 0 && + get_parent()->try_lock_for_read(c, manager)) { dout(10) << "calc_head_subsets " << head << " has prev " << c << " overlap " << prev << dendl; clone_subsets[c] = prev; @@ -1299,6 +1302,7 @@ void ReplicatedBackend::calc_head_subsets( if (cloning.num_intervals() > cct->_conf->osd_recover_clone_overlap_limit) { dout(10) << "skipping clone, too many holes" << dendl; + get_parent()->release_locks(manager); clone_subsets.clear(); cloning.clear(); } @@ -1316,7 +1320,8 @@ void ReplicatedBackend::calc_clone_subsets( const pg_missing_t& missing, const hobject_t &last_backfill, interval_set& data_subset, - map, hobject_t::BitwiseComparator>& clone_subsets) + map, hobject_t::BitwiseComparator>& clone_subsets, + ObcLockManager &manager) { dout(10) << "calc_clone_subsets " << soid << " clone_overlap " << snapset.clone_overlap << dendl; @@ -1350,7 +1355,8 @@ void ReplicatedBackend::calc_clone_subsets( c.snap = snapset.clones[j]; prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]); if (!missing.is_missing(c) && - cmp(c, last_backfill, get_parent()->sort_bitwise()) < 0) { + cmp(c, last_backfill, get_parent()->sort_bitwise()) < 0 && + get_parent()->try_lock_for_read(c, manager)) { dout(10) << "calc_clone_subsets " << soid << " has prev " << c << " overlap " << prev << dendl; clone_subsets[c] = prev; @@ -1370,7 +1376,8 @@ void ReplicatedBackend::calc_clone_subsets( c.snap = snapset.clones[j]; next.intersection_of(snapset.clone_overlap[snapset.clones[j-1]]); if (!missing.is_missing(c) && - cmp(c, last_backfill, get_parent()->sort_bitwise()) < 0) { + cmp(c, last_backfill, get_parent()->sort_bitwise()) < 0 && + get_parent()->try_lock_for_read(c, manager)) { dout(10) << "calc_clone_subsets " << soid << " has next " << c << " overlap " << next << dendl; clone_subsets[c] = next; @@ -1383,6 +1390,7 @@ void ReplicatedBackend::calc_clone_subsets( if (cloning.num_intervals() > cct->_conf->osd_recover_clone_overlap_limit) { dout(10) << "skipping clone, too many holes" << dendl; + get_parent()->release_locks(manager); clone_subsets.clear(); cloning.clear(); } @@ -1444,6 +1452,7 @@ void ReplicatedBackend::prepare_pull( } ObjectRecoveryInfo recovery_info; + ObcLockManager lock_manager; if (soid.is_snap()) { assert(!get_parent()->get_local_missing().is_missing( @@ -1455,10 +1464,12 @@ void ReplicatedBackend::prepare_pull( SnapSetContext *ssc = headctx->ssc; assert(ssc); dout(10) << " snapset " << ssc->snapset << dendl; - calc_clone_subsets(ssc->snapset, soid, get_parent()->get_local_missing(), - get_info().last_backfill, - recovery_info.copy_subset, - recovery_info.clone_subset); + calc_clone_subsets( + ssc->snapset, soid, get_parent()->get_local_missing(), + get_info().last_backfill, + recovery_info.copy_subset, + recovery_info.clone_subset, + lock_manager); // FIXME: this may overestimate if we are pulling multiple clones in parallel... dout(10) << " pulling " << recovery_info << dendl; @@ -1486,10 +1497,13 @@ void ReplicatedBackend::prepare_pull( assert(!pulling.count(soid)); pull_from_peer[fromshard].insert(soid); PullInfo &pi = pulling[soid]; + pi.from = fromshard; + pi.soid = soid; pi.head_ctx = headctx; pi.recovery_info = op.recovery_info; pi.recovery_progress = op.recovery_progress; pi.cache_dont_need = h->cache_dont_need; + pi.lock_manager = std::move(lock_manager); } /* @@ -1509,6 +1523,7 @@ void ReplicatedBackend::prep_push_to_replica( map, hobject_t::BitwiseComparator> clone_subsets; interval_set data_subset; + ObcLockManager lock_manager; // are we doing a clone on the replica? if (soid.snap && soid.snap < CEPH_NOSNAP) { hobject_t head = soid; @@ -1537,10 +1552,12 @@ void ReplicatedBackend::prep_push_to_replica( map::const_iterator pi = get_parent()->get_shard_info().find(peer); assert(pi != get_parent()->get_shard_info().end()); - calc_clone_subsets(ssc->snapset, soid, - pm->second, - pi->second.last_backfill, - data_subset, clone_subsets); + calc_clone_subsets( + ssc->snapset, soid, + pm->second, + pi->second.last_backfill, + data_subset, clone_subsets, + lock_manager); } else if (soid.snap == CEPH_NOSNAP) { // pushing head or unversioned object. // base this on partially on replica's clones? @@ -1551,10 +1568,20 @@ void ReplicatedBackend::prep_push_to_replica( obc, ssc->snapset, soid, get_parent()->get_shard_missing().find(peer)->second, get_parent()->get_shard_info().find(peer)->second.last_backfill, - data_subset, clone_subsets); + data_subset, clone_subsets, + lock_manager); } - prep_push(obc, soid, peer, oi.version, data_subset, clone_subsets, pop, cache_dont_need); + prep_push( + obc, + soid, + peer, + oi.version, + data_subset, + clone_subsets, + pop, + cache_dont_need, + std::move(lock_manager)); } void ReplicatedBackend::prep_push(ObjectContextRef obc, @@ -1568,7 +1595,7 @@ void ReplicatedBackend::prep_push(ObjectContextRef obc, prep_push(obc, soid, peer, obc->obs.oi.version, data_subset, clone_subsets, - pop, cache_dont_need); + pop, cache_dont_need, ObcLockManager()); } void ReplicatedBackend::prep_push( @@ -1578,7 +1605,8 @@ void ReplicatedBackend::prep_push( interval_set &data_subset, map, hobject_t::BitwiseComparator>& clone_subsets, PushOp *pop, - bool cache_dont_need) + bool cache_dont_need, + ObcLockManager &&lock_manager) { get_parent()->begin_peer_recover(peer, soid); // take note. @@ -1594,6 +1622,7 @@ void ReplicatedBackend::prep_push( pi.recovery_progress.data_recovered_to = 0; pi.recovery_progress.data_complete = 0; pi.recovery_progress.omap_complete = 0; + pi.lock_manager = std::move(lock_manager); ObjectRecoveryProgress new_progress; int r = build_push_op(pi.recovery_info, @@ -1731,7 +1760,8 @@ void ReplicatedBackend::submit_push_complete(ObjectRecoveryInfo &recovery_info, ObjectRecoveryInfo ReplicatedBackend::recalc_subsets( const ObjectRecoveryInfo& recovery_info, - SnapSetContext *ssc) + SnapSetContext *ssc, + ObcLockManager &manager) { if (!recovery_info.soid.snap || recovery_info.soid.snap >= CEPH_NOSNAP) return recovery_info; @@ -1739,17 +1769,19 @@ ObjectRecoveryInfo ReplicatedBackend::recalc_subsets( new_info.copy_subset.clear(); new_info.clone_subset.clear(); assert(ssc); - calc_clone_subsets(ssc->snapset, new_info.soid, get_parent()->get_local_missing(), - get_info().last_backfill, - new_info.copy_subset, new_info.clone_subset); + get_parent()->release_locks(manager); // might already have locks + calc_clone_subsets( + ssc->snapset, new_info.soid, get_parent()->get_local_missing(), + get_info().last_backfill, + new_info.copy_subset, new_info.clone_subset, + manager); return new_info; } bool ReplicatedBackend::handle_pull_response( pg_shard_t from, PushOp &pop, PullOp *response, - list *to_continue, - ObjectStore::Transaction *t - ) + list *to_continue, + ObjectStore::Transaction *t) { interval_set data_included = pop.data_included; bufferlist data; @@ -1794,7 +1826,10 @@ bool ReplicatedBackend::handle_pull_response( } pi.obc = get_parent()->get_obc(pi.recovery_info.soid, pop.attrset); pi.recovery_info.oi = pi.obc->obs.oi; - pi.recovery_info = recalc_subsets(pi.recovery_info, pi.obc->ssc); + pi.recovery_info = recalc_subsets( + pi.recovery_info, + pi.obc->ssc, + pi.lock_manager); } @@ -1830,12 +1865,10 @@ bool ReplicatedBackend::handle_pull_response( if (complete) { pi.stat.num_objects_recovered++; - to_continue->push_back(hoid); + to_continue->push_back({hoid, pi.obc, pi.stat}); get_parent()->on_local_recover( hoid, pi.recovery_info, pi.obc, t); - pull_from_peer[from].erase(hoid); - if (pull_from_peer[from].empty()) - pull_from_peer.erase(from); + clear_pull(pulling.find(hoid)); return false; } else { response->soid = pop.soid; @@ -2182,6 +2215,7 @@ bool ReplicatedBackend::handle_push_reply(pg_shard_t peer, PushReplyOp &op, Push stat.num_keys_recovered = reply->omap_entries.size(); stat.num_objects_recovered = 1; + get_parent()->release_locks(pi->lock_manager); pushing[soid].erase(peer); pi = NULL; @@ -2328,7 +2362,7 @@ void ReplicatedBackend::sub_op_push(OpRequestRef op) if (is_primary()) { PullOp resp; RPGHandle *h = _open_recovery_op(); - list to_continue; + list to_continue; bool more = handle_pull_response( m->from, pop, &resp, &to_continue, &t); @@ -2369,10 +2403,22 @@ void ReplicatedBackend::_failed_push(pg_shard_t from, const hobject_t &soid) { list fl = { from }; get_parent()->failed_push(fl, soid); - pull_from_peer[from].erase(soid); - if (pull_from_peer[from].empty()) - pull_from_peer.erase(from); - pulling.erase(soid); + + clear_pull(pulling.find(soid)); +} + +void ReplicatedBackend::clear_pull( + map::iterator piter, + bool clear_pull_from_peer) +{ + auto from = piter->second.from; + if (clear_pull_from_peer) { + pull_from_peer[from].erase(piter->second.soid); + if (pull_from_peer[from].empty()) + pull_from_peer.erase(from); + } + get_parent()->release_locks(piter->second.lock_manager); + pulling.erase(piter); } int ReplicatedBackend::start_pushes( diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h index b9431cc0a8d01..67e8784238cfe 100644 --- a/src/osd/ReplicatedBackend.h +++ b/src/osd/ReplicatedBackend.h @@ -169,6 +169,7 @@ private: ObjectRecoveryInfo recovery_info; ObjectContextRef obc; object_stat_sum_t stat; + ObcLockManager lock_manager; void dump(Formatter *f) const { { @@ -187,12 +188,15 @@ private: // pull struct PullInfo { + pg_shard_t from; + hobject_t soid; ObjectRecoveryProgress recovery_progress; ObjectRecoveryInfo recovery_info; ObjectContextRef head_ctx; ObjectContextRef obc; object_stat_sum_t stat; bool cache_dont_need; + ObcLockManager lock_manager; void dump(Formatter *f) const { { @@ -216,6 +220,9 @@ private: // Reverse mapping from osd peer to objects beging pulled from that peer map > pull_from_peer; + void clear_pull( + map::iterator piter, + bool clear_pull_from_peer = true); void sub_op_push(OpRequestRef op); void sub_op_push_reply(OpRequestRef op); @@ -235,9 +242,15 @@ private: bool handle_push_reply(pg_shard_t peer, PushReplyOp &op, PushOp *reply); void handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply); + + struct pull_complete_info { + hobject_t hoid; + ObjectContextRef obc; + object_stat_sum_t stat; + }; bool handle_pull_response( pg_shard_t from, PushOp &op, PullOp *response, - list *to_continue, + list *to_continue, ObjectStore::Transaction *t); void handle_push(pg_shard_t from, PushOp &op, PushReplyOp *response, ObjectStore::Transaction *t); @@ -283,7 +296,8 @@ private: SnapSet& snapset, const hobject_t& poid, const pg_missing_t& missing, const hobject_t &last_backfill, interval_set& data_subset, - map, hobject_t::BitwiseComparator>& clone_subsets); + map, hobject_t::BitwiseComparator>& clone_subsets, + ObcLockManager &lock_manager); void prepare_pull( eversion_t v, const hobject_t& soid, @@ -296,26 +310,31 @@ private: void prep_push_to_replica( ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer, PushOp *pop, bool cache_dont_need = true); - void prep_push(ObjectContextRef obc, - const hobject_t& oid, pg_shard_t dest, - PushOp *op, - bool cache_dont_need); - void prep_push(ObjectContextRef obc, - const hobject_t& soid, pg_shard_t peer, - eversion_t version, - interval_set &data_subset, - map, hobject_t::BitwiseComparator>& clone_subsets, - PushOp *op, - bool cache = false); - void calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head, - const pg_missing_t& missing, - const hobject_t &last_backfill, - interval_set& data_subset, - map, hobject_t::BitwiseComparator>& clone_subsets); + void prep_push( + ObjectContextRef obc, + const hobject_t& oid, pg_shard_t dest, + PushOp *op, + bool cache_dont_need); + void prep_push( + ObjectContextRef obc, + const hobject_t& soid, pg_shard_t peer, + eversion_t version, + interval_set &data_subset, + map, hobject_t::BitwiseComparator>& clone_subsets, + PushOp *op, + bool cache, + ObcLockManager &&lock_manager); + void calc_head_subsets( + ObjectContextRef obc, SnapSet& snapset, const hobject_t& head, + const pg_missing_t& missing, + const hobject_t &last_backfill, + interval_set& data_subset, + map, hobject_t::BitwiseComparator>& clone_subsets, + ObcLockManager &lock_manager); ObjectRecoveryInfo recalc_subsets( const ObjectRecoveryInfo& recovery_info, - SnapSetContext *ssc - ); + SnapSetContext *ssc, + ObcLockManager &lock_manager); /** * Client IO diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index f8aeb083fc1b7..212a32ef5a1c1 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -4345,6 +4345,9 @@ public: } return false; } + bool try_get_read_lock() { + return rwstate.get_read_lock(); + } void drop_recovery_read(list *ls) { assert(rwstate.recovery_read_marker); rwstate.put_read(ls); @@ -4504,6 +4507,7 @@ public: ObcLockManager() = default; ObcLockManager(ObcLockManager &&) = default; ObcLockManager(const ObcLockManager &) = delete; + ObcLockManager &operator=(ObcLockManager &&) = default; bool empty() const { return locks.empty(); } @@ -4564,6 +4568,23 @@ public: return false; } } + + /// try get read lock + bool try_get_read_lock( + const hobject_t &hoid, + ObjectContextRef obc) { + assert(locks.find(hoid) == locks.end()); + if (obc->try_get_read_lock()) { + locks.insert( + make_pair( + hoid, + ObjectLockState(obc, ObjectContext::RWState::RWREAD))); + return true; + } else { + return false; + } + } + void put_locks( list > > *to_requeue, bool *requeue_recovery, -- 2.39.5