From c41d4dc4bbdd541ab3a94271ca76fbe1c8a3bedb Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Fri, 14 Jun 2013 13:44:34 -0700 Subject: [PATCH] ReplicatedPG: send pushes en mass in recover_replicas, recover_backfill This way, the pushes might be later merged into a smaller number of messages. Signed-off-by: Samuel Just --- src/osd/ReplicatedPG.cc | 92 +++++++++++++++++++++++++++++------------ src/osd/ReplicatedPG.h | 35 +++++++++------- 2 files changed, 86 insertions(+), 41 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 237b80f119a28..7d44e56ba9d61 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -175,7 +175,9 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef break; } } - recover_object_replicas(soid, v, g_conf->osd_client_op_priority); + map > pushes; + prep_object_replica_pushes(soid, v, g_conf->osd_client_op_priority, &pushes); + send_pushes(g_conf->osd_client_op_priority, pushes); } waiting_for_degraded_object[soid].push_back(op); op->mark_delayed("waiting for degraded object"); @@ -5258,14 +5260,16 @@ void ReplicatedPG::send_remove_op(const hobject_t& oid, eversion_t v, int peer) * intelligently push an object to a replica. make use of existing * clones/heads and dup data ranges where possible. */ -void ReplicatedPG::push_to_replica( - ObjectContext *obc, const hobject_t& soid, int peer, - int prio) +void ReplicatedPG::prep_push_to_replica( + ObjectContext *obc, const hobject_t& soid, int peer, + int prio, + PushOp *pop) { const object_info_t& oi = obc->obs.oi; uint64_t size = obc->obs.oi.size; - dout(10) << "push_to_replica " << soid << " v" << oi.version << " size " << size << " to osd." << peer << dendl; + dout(10) << __func__ << soid << " v" << oi.version + << " size " << size << " to osd." << peer << dendl; map > clone_subsets; interval_set data_subset; @@ -5279,13 +5283,13 @@ void ReplicatedPG::push_to_replica( // we need the head (and current SnapSet) locally to do that. if (pg_log.get_missing().is_missing(head)) { dout(15) << "push_to_replica missing head " << head << ", pushing raw clone" << dendl; - return push_start(prio, obc, soid, peer); + return prep_push(prio, obc, soid, peer, pop); } hobject_t snapdir = head; snapdir.snap = CEPH_SNAPDIR; if (pg_log.get_missing().is_missing(snapdir)) { dout(15) << "push_to_replica missing snapdir " << snapdir << ", pushing raw clone" << dendl; - return push_start(prio, obc, soid, peer); + return prep_push(prio, obc, soid, peer, pop); } SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false); @@ -5307,29 +5311,32 @@ void ReplicatedPG::push_to_replica( put_snapset_context(ssc); } - push_start(prio, obc, soid, peer, oi.version, data_subset, clone_subsets); + prep_push(prio, obc, soid, peer, oi.version, data_subset, clone_subsets, pop); } -void ReplicatedPG::push_start(int prio, - ObjectContext *obc, - const hobject_t& soid, int peer) +void ReplicatedPG::prep_push(int prio, + ObjectContext *obc, + const hobject_t& soid, int peer, + PushOp *pop) { interval_set data_subset; if (obc->obs.oi.size) data_subset.insert(0, obc->obs.oi.size); map > clone_subsets; - push_start(prio, obc, soid, peer, - obc->obs.oi.version, data_subset, clone_subsets); + prep_push(prio, obc, soid, peer, + obc->obs.oi.version, data_subset, clone_subsets, + pop); } -void ReplicatedPG::push_start( +void ReplicatedPG::prep_push( int prio, ObjectContext *obc, const hobject_t& soid, int peer, eversion_t version, interval_set &data_subset, - map >& clone_subsets) + map >& clone_subsets, + PushOp *pop) { peer_missing[peer].revise_have(soid, eversion_t()); // take note. @@ -5347,9 +5354,10 @@ void ReplicatedPG::push_start( pi.priority = prio; ObjectRecoveryProgress new_progress; - send_push(pi.priority, - peer, pi.recovery_info, - pi.recovery_progress, &new_progress); + build_push_op(pi.recovery_info, + pi.recovery_progress, + &new_progress, + pop); pi.recovery_progress = new_progress; } @@ -5692,6 +5700,19 @@ void ReplicatedPG::handle_push( info.last_complete)); } +void ReplicatedPG::send_pushes(int prio, map > &pushes) +{ + for (map >::iterator i = pushes.begin(); + i != pushes.end(); + ++i) { + for (vector::iterator j = i->second.begin(); + j != i->second.end(); + ++j) { + send_push_op(prio, i->first, *j); + } + } +} + int ReplicatedPG::send_push(int prio, int peer, const ObjectRecoveryInfo &recovery_info, const ObjectRecoveryProgress &progress, @@ -6918,10 +6939,11 @@ int ReplicatedPG::recover_primary(int max) return started; } -int ReplicatedPG::recover_object_replicas( - const hobject_t& soid, eversion_t v, int prio) +int ReplicatedPG::prep_object_replica_pushes( + const hobject_t& soid, eversion_t v, int prio, + map > *pushes) { - dout(10) << "recover_object_replicas " << soid << dendl; + dout(10) << __func__ << ": on " << soid << dendl; // NOTE: we know we will get a valid oloc off of disk here. ObjectContext *obc = get_object_context(soid, OLOC_BLANK, false); @@ -6959,7 +6981,10 @@ int ReplicatedPG::recover_object_replicas( start_recovery_op(soid); started = true; } - push_to_replica(obc, soid, peer, prio); + (*pushes)[peer].push_back(PushOp()); + prep_push_to_replica(obc, soid, peer, prio, + &((*pushes)[peer].back()) + ); } } @@ -6975,6 +7000,8 @@ int ReplicatedPG::recover_replicas(int max) dout(10) << __func__ << "(" << max << ")" << dendl; int started = 0; + map > pushes; + // this is FAR from an optimal recovery order. pretty lame, really. for (unsigned i=1; i::const_iterator r = m.missing.find(soid); - started += recover_object_replicas(soid, r->second.need, - g_conf->osd_recovery_op_priority); + started += prep_object_replica_pushes(soid, r->second.need, + g_conf->osd_recovery_op_priority, + &pushes); } } + send_pushes(g_conf->osd_recovery_op_priority, pushes); + return started; } @@ -7170,11 +7200,15 @@ int ReplicatedPG::recover_backfill(int max) ++i) { send_remove_op(i->first, i->second, backfill_target); } + + map > pushes; for (map >::iterator i = to_push.begin(); i != to_push.end(); ++i) { - push_backfill_object(i->first, i->second.first, i->second.second, backfill_target); + prep_backfill_object_push( + i->first, i->second.first, i->second.second, backfill_target, &pushes); } + send_pushes(g_conf->osd_recovery_op_priority, pushes); release_waiting_for_backfill_pos(); dout(5) << "backfill_pos is " << backfill_pos << " and pinfo.last_backfill is " @@ -7218,7 +7252,9 @@ int ReplicatedPG::recover_backfill(int max) return ops; } -void ReplicatedPG::push_backfill_object(hobject_t oid, eversion_t v, eversion_t have, int peer) +void ReplicatedPG::prep_backfill_object_push( + hobject_t oid, eversion_t v, eversion_t have, int peer, + map > *pushes) { dout(10) << "push_backfill_object " << oid << " v " << v << " to osd." << peer << dendl; @@ -7228,7 +7264,9 @@ void ReplicatedPG::push_backfill_object(hobject_t oid, eversion_t v, eversion_t start_recovery_op(oid); ObjectContext *obc = get_object_context(oid, OLOC_BLANK, false); obc->ondisk_read_lock(); - push_to_replica(obc, oid, peer, g_conf->osd_recovery_op_priority); + (*pushes)[peer].push_back(PushOp()); + prep_push_to_replica(obc, oid, peer, g_conf->osd_recovery_op_priority, + &((*pushes)[peer].back())); obc->ondisk_read_unlock(); put_object_context(obc); } diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index a20ebad9cff10..6121227c9b884 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -562,6 +562,7 @@ protected: void handle_push( int from, PushOp &op, PushReplyOp *response, ObjectStore::Transaction *t); + void send_pushes(int prio, map > &pushes); int send_push(int priority, int peer, const ObjectRecoveryInfo& recovery_info, const ObjectRecoveryProgress &progress, @@ -687,8 +688,9 @@ protected: // Reverse mapping from osd peer to objects beging pulled from that peer map > pull_from_peer; - int recover_object_replicas(const hobject_t& soid, eversion_t v, - int priority); + int prep_object_replica_pushes(const hobject_t& soid, eversion_t v, + int priority, + map > *pushes); void calc_head_subsets(ObjectContext *obc, SnapSet& snapset, const hobject_t& head, pg_missing_t& missing, const hobject_t &last_backfill, @@ -698,20 +700,23 @@ protected: const hobject_t &last_backfill, interval_set& data_subset, map >& clone_subsets); - void push_to_replica( + void prep_push_to_replica( ObjectContext *obc, const hobject_t& oid, int dest, - int priority); - void push_start(int priority, - ObjectContext *obc, - const hobject_t& oid, int dest); - void push_start(int priority, - ObjectContext *obc, - const hobject_t& soid, int peer, - eversion_t version, - interval_set &data_subset, - map >& clone_subsets); + int priority, + PushOp *push_op); + void prep_push(int priority, + ObjectContext *obc, + const hobject_t& oid, int dest, + PushOp *op); + void prep_push(int priority, + ObjectContext *obc, + const hobject_t& soid, int peer, + eversion_t version, + interval_set &data_subset, + map >& clone_subsets, + PushOp *op); void send_push_op_blank(const hobject_t& soid, int peer); void finish_degraded_object(const hobject_t& oid); @@ -758,7 +763,9 @@ protected: */ void scan_range(hobject_t begin, int min, int max, BackfillInterval *bi); - void push_backfill_object(hobject_t oid, eversion_t v, eversion_t have, int peer); + void prep_backfill_object_push( + hobject_t oid, eversion_t v, eversion_t have, int peer, + map > *pushes); void send_remove_op(const hobject_t& oid, eversion_t v, int peer); -- 2.39.5