From: David Zafman Date: Fri, 8 Nov 2013 03:45:46 +0000 (-0800) Subject: osd: Interim working version with backfill reserve state changes X-Git-Tag: v0.77~23^2~3 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=695255ecf91381ccb24e1efa0bb0555fb14dfd25;p=ceph.git osd: Interim working version with backfill reserve state changes Signed-off-by: David Zafman --- diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 66392037270..35806a00b9f 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1029,9 +1029,9 @@ bool PG::choose_acting(int& newest_update_osd) compat_mode = false; } - // For now we only backfill 1 at a time as before - if (!backfill.empty()) - backfill.resize(1); + if (compat_mode && !backfill.empty()) { + backfill.resize(1); + } // This might cause a problem if min_size is large // and we need to backfill more than 1 osd. Older @@ -4747,8 +4747,8 @@ ostream& operator<<(ostream& out, const PG& pg) } } - if (pg.get_backfill_target() >= 0) - out << " bft=" << pg.get_backfill_target(); + if (!pg.backfill_targets.empty()) + out << " bft=" << pg.backfill_targets; if (pg.last_complete_ondisk != pg.info.last_complete) out << " lcod " << pg.last_complete_ondisk; @@ -5454,6 +5454,7 @@ PG::RecoveryState::Backfilling::Backfilling(my_context ctx) PG *pg = context< RecoveryMachine >().pg; pg->backfill_reserved = true; pg->osd->queue_for_recovery(pg); + pg->state_clear(PG_STATE_BACKFILL_TOOFULL); pg->state_clear(PG_STATE_BACKFILL_WAIT); pg->state_set(PG_STATE_BACKFILL); } @@ -5486,27 +5487,43 @@ void PG::RecoveryState::Backfilling::exit() PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx) : my_base(ctx), - NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteBackfillReserved") + NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteBackfillReserved"), + backfill_osd_it(context< Active >().sorted_backfill_set.begin()) { context< RecoveryMachine 
>().log_enter(state_name); PG *pg = context< RecoveryMachine >().pg; pg->state_set(PG_STATE_BACKFILL_WAIT); - ConnectionRef con = pg->osd->get_con_osd_cluster( - pg->get_backfill_target(), pg->get_osdmap()->get_epoch()); - if (con) { - if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) { - unsigned priority = pg->is_degraded() ? OSDService::BACKFILL_HIGH + post_event(RemoteBackfillReserved()); +} + +boost::statechart::result +PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + + if (backfill_osd_it != context< Active >().sorted_backfill_set.end()) { + //The primary never backfills itself + assert(*backfill_osd_it != pg->osd->whoami); + ConnectionRef con = pg->osd->get_con_osd_cluster(*backfill_osd_it, pg->get_osdmap()->get_epoch()); + if (con) { + if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) { + unsigned priority = pg->is_degraded() ? OSDService::BACKFILL_HIGH : OSDService::BACKFILL_LOW; - pg->osd->send_message_osd_cluster( - new MBackfillReserve( + pg->osd->send_message_osd_cluster( + new MBackfillReserve( MBackfillReserve::REQUEST, pg->info.pgid, pg->get_osdmap()->get_epoch(), priority), con.get()); - } else { - post_event(RemoteBackfillReserved()); + } else { + post_event(RemoteBackfillReserved()); + } } + ++backfill_osd_it; + } else { + post_event(AllBackfillsReserved()); } + return discard_event(); } void PG::RecoveryState::WaitRemoteBackfillReserved::exit() @@ -5517,14 +5534,6 @@ void PG::RecoveryState::WaitRemoteBackfillReserved::exit() pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur); } -boost::statechart::result -PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt) -{ - PG *pg = context< RecoveryMachine >().pg; - pg->state_clear(PG_STATE_BACKFILL_TOOFULL); - return transit<Backfilling>(); -} - boost::statechart::result PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected 
&evt) { @@ -5817,7 +5826,6 @@ void PG::RecoveryState::Recovering::release_reservations() { PG *pg = context< RecoveryMachine >().pg; assert(!pg->pg_log.get_missing().have_missing()); - pg->state_clear(PG_STATE_RECOVERING); // release remote reservations for (set<int>::const_iterator i = context< Active >().sorted_acting_set.begin(); @@ -5933,6 +5941,8 @@ PG::RecoveryState::Active::Active(my_context ctx) NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active"), sorted_acting_set(context< RecoveryMachine >().pg->actingbackfill.begin(), context< RecoveryMachine >().pg->actingbackfill.end()), + sorted_backfill_set(context< RecoveryMachine >().pg->backfill_targets.begin(), + context< RecoveryMachine >().pg->backfill_targets.end()), all_replicas_activated(false) { context< RecoveryMachine >().log_enter(state_name); diff --git a/src/osd/PG.h b/src/osd/PG.h index 4047dd4b3fc..ae827dd430e 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -507,20 +507,13 @@ protected: BackfillInterval backfill_info; BackfillInterval peer_backfill_info; - vector<int> backfill_targets; bool backfill_reserved; bool backfill_reserving; friend class OSD; public: - // Compatibility with single backfill target code - int get_backfill_target() const { - int backfill_target = -1; - if (backfill_targets.size() > 0) - backfill_target = backfill_targets[0]; - return backfill_target; - } + vector<int> backfill_targets; protected: @@ -1083,8 +1076,8 @@ public: TrivialEvent(LocalRecoveryReserved) TrivialEvent(RemoteRecoveryReserved) TrivialEvent(AllRemotesReserved) + TrivialEvent(AllBackfillsReserved) TrivialEvent(Recovering) - TrivialEvent(WaitRemoteBackfillReserved) TrivialEvent(GoClean) TrivialEvent(AllReplicasActivated) @@ -1310,6 +1303,7 @@ public: void exit(); const set<int> sorted_acting_set; + const set<int> sorted_backfill_set; bool all_replicas_activated; typedef boost::mpl::list < @@ -1368,8 +1362,10 @@ public: struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, 
Active >, NamedState { typedef boost::mpl::list< boost::statechart::custom_reaction< RemoteBackfillReserved >, - boost::statechart::custom_reaction< RemoteReservationRejected > + boost::statechart::custom_reaction< RemoteReservationRejected >, + boost::statechart::transition< AllBackfillsReserved, Backfilling > > reactions; + set<int>::const_iterator backfill_osd_it; WaitRemoteBackfillReserved(my_context ctx); void exit(); boost::statechart::result react(const RemoteBackfillReserved& evt); diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index ffa29a24c04..64d655336d4 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -274,7 +274,8 @@ void ReplicatedPG::on_peer_recover( publish_stats_to_osd(); // done! peer_missing[peer].got(soid, recovery_info.version); - if (peer == get_backfill_target() && backfills_in_flight.count(soid)) { + if (!backfill_targets.empty() && peer == backfill_targets[0] + && backfills_in_flight.count(soid)) { map::iterator i = recovering.find(soid); assert(i != recovering.end()); list requeue_list; @@ -364,7 +365,7 @@ bool ReplicatedPG::is_degraded_object(const hobject_t& soid) // Object is degraded if after last_backfill AND // we are backfilling it - if (peer == get_backfill_target() && + if (!backfill_targets.empty() && peer == backfill_targets[0] && peer_info[peer].last_backfill <= soid && last_backfill_started >= soid && backfills_in_flight.count(soid)) @@ -1168,10 +1169,9 @@ void ReplicatedPG::do_op(OpRequestRef op) // The last_backfill_started is used as the backfill line since // that determines the boundary for writes. 
pg_info_t *backfill_target_info = NULL; - int backfill_target = get_backfill_target(); bool before_backfill = false; - if (backfill_target >= 0) { - backfill_target_info = &peer_info[backfill_target]; + if (!backfill_targets.empty()) { + backfill_target_info = &peer_info[backfill_targets[0]]; before_backfill = obc->obs.oi.soid <= last_backfill_started; } @@ -1842,7 +1842,8 @@ void ReplicatedPG::do_scan( case MOSDPGScan::OP_SCAN_DIGEST: { int from = m->get_source().num(); - assert(from == get_backfill_target()); + //XXX: Check that from is in backfill_targets vector + //assert(from == get_backfill_target()); BackfillInterval& bi = peer_backfill_info; bi.begin = m->begin; bi.end = m->end; @@ -4599,9 +4600,8 @@ void ReplicatedPG::finish_ctx(OpContext *ctx) ctx->obc->ssc->snapset = ctx->new_snapset; info.stats.stats.add(ctx->delta_stats, ctx->obs->oi.category); - int backfill_target = get_backfill_target(); - if (backfill_target >= 0) { - pg_info_t& pinfo = peer_info[backfill_target]; + if (!backfill_targets.empty()) { + pg_info_t& pinfo = peer_info[backfill_targets[0]]; if (soid <= pinfo.last_backfill) pinfo.stats.stats.add(ctx->delta_stats, ctx->obs->oi.category); else if (soid <= last_backfill_started) @@ -5635,9 +5635,8 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now) assert(0 == "broken implementation, do not use"); } - int backfill_target = get_backfill_target(); // ship resulting transaction, log entries, and pg_stats - if (peer == backfill_target && soid > last_backfill_started && + if (!backfill_targets.empty() && peer == backfill_targets[0] && soid > last_backfill_started && // only skip normal (not temp pool=-1) objects soid.pool == (int64_t)info.pgid.pool()) { dout(10) << "issue_repop shipping empty opt to osd." 
<< peer @@ -5652,7 +5651,7 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now) ::encode(repop->ctx->log, wr->logbl); - if (backfill_target >= 0 && backfill_target == peer) + if (!backfill_targets.empty() && backfill_targets[0] == peer) wr->pg_stats = pinfo.stats; // reflects backfill progress else wr->pg_stats = info.stats; @@ -7821,7 +7820,7 @@ eversion_t ReplicatedPG::pick_newest_available(const hobject_t& oid) for (unsigned i=1; i= 0 && started < max && + !backfill_targets.empty() && started < max && missing.num_missing() == 0 && !waiting_on_backfill) { if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) { @@ -8757,10 +8754,10 @@ int ReplicatedPG::recover_backfill( ThreadPool::TPHandle &handle, bool *work_started) { dout(10) << "recover_backfill (" << max << ")" << dendl; - int backfill_target = get_backfill_target(); - assert(backfill_target >= 0); + assert(!backfill_targets.empty()); - pg_info_t& pinfo = peer_info[backfill_target]; + //XXX: Look through backfill_targets + pg_info_t& pinfo = peer_info[backfill_targets[0]]; BackfillInterval& pbi = peer_backfill_info; // Initialize from prior backfill state @@ -8769,7 +8766,7 @@ int ReplicatedPG::recover_backfill( backfill_info.reset(pinfo.last_backfill); } - dout(10) << " peer osd." << backfill_target + dout(10) << " peer osd." << backfill_targets[0] << " last_backfill_started " << last_backfill_started << " info " << pinfo << " interval " << pbi.begin << "-" << pbi.end @@ -8807,11 +8804,11 @@ int ReplicatedPG::recover_backfill( if (pbi.begin <= backfill_info.begin && !pbi.extends_to_end() && pbi.empty()) { - dout(10) << " scanning peer osd." << backfill_target << " from " << pbi.end << dendl; + dout(10) << " scanning peer osd." 
<< backfill_targets[0] << " from " << pbi.end << dendl; epoch_t e = get_osdmap()->get_epoch(); MOSDPGScan *m = new MOSDPGScan(MOSDPGScan::OP_SCAN_GET_DIGEST, e, e, info.pgid, pbi.end, hobject_t()); - osd->send_message_osd_cluster(backfill_target, m, get_osdmap()->get_epoch()); + osd->send_message_osd_cluster(backfill_targets[0], m, get_osdmap()->get_epoch()); waiting_on_backfill = true; start_recovery_op(pbi.end); ops++; @@ -8874,7 +8871,7 @@ int ReplicatedPG::recover_backfill( if (obc->get_backfill_read()) { dout(20) << " pushing local " << backfill_info.begin << " " << backfill_info.objects.begin()->second - << " to peer osd." << backfill_target << dendl; + << " to peer osd." << backfill_targets[0] << dendl; to_push[backfill_info.begin] = boost::make_tuple( backfill_info.objects.begin()->second, @@ -8908,7 +8905,7 @@ int ReplicatedPG::recover_backfill( handle.reset_tp_timeout(); // ordered before any subsequent updates - send_remove_op(i->first, i->second, backfill_target); + send_remove_op(i->first, i->second, backfill_targets[0]); pending_backfill_updates[i->first]; // add empty stat! } @@ -8922,7 +8919,7 @@ int ReplicatedPG::recover_backfill( handle.reset_tp_timeout(); prep_backfill_object_push( i->first, i->second.get<0>(), i->second.get<1>(), i->second.get<2>(), - backfill_target, h); + backfill_targets[0], h); } pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority); @@ -8982,7 +8979,7 @@ int ReplicatedPG::recover_backfill( } m->last_backfill = pinfo.last_backfill; m->stats = pinfo.stats; - osd->send_message_osd_cluster(backfill_target, m, get_osdmap()->get_epoch()); + osd->send_message_osd_cluster(backfill_targets[0], m, get_osdmap()->get_epoch()); } dout(10) << " peer num_objects now " << pinfo.stats.stats.sum.num_objects @@ -8999,9 +8996,10 @@ void ReplicatedPG::prep_backfill_object_push( PGBackend::RecoveryHandle *h) { dout(10) << "push_backfill_object " << oid << " v " << v << " to osd." 
<< peer << dendl; + assert(!backfill_targets.empty()); backfills_in_flight.insert(oid); - map::iterator bpm = peer_missing.find(get_backfill_target()); + map::iterator bpm = peer_missing.find(backfill_targets[0]); assert(bpm != peer_missing.end()); bpm->second.add(oid, eversion_t(), eversion_t()); diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index d42031b3af0..a9631128aec 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -697,7 +697,11 @@ protected: map pending_backfill_updates; void dump_recovery_info(Formatter *f) const { - f->dump_int("backfill_target", get_backfill_target()); + f->open_array_section("backfill_targets"); + for (vector<int>::const_iterator p = backfill_targets.begin(); + p != backfill_targets.end(); ++p) + f->dump_int("osd", *p); + f->close_section(); f->dump_int("waiting_on_backfill", waiting_on_backfill); f->dump_stream("last_backfill_started") << last_backfill_started; {