From 12b60ed53a609cfdaa33dd6f447ab9920271f605 Mon Sep 17 00:00:00 2001
From: Samuel Just <sjust@redhat.com>
Date: Thu, 25 Apr 2019 10:39:22 -0700
Subject: [PATCH] osd/: mechanically rip PeeringState and friends out into
 their own file

The next step is to rationalize the PeeringState<->PG relationship.

Signed-off-by: Samuel Just <sjust@redhat.com>
---
 src/osd/CMakeLists.txt  |    1 +
 src/osd/OSD.cc          |   16 +-
 src/osd/PG.cc           | 2816 +--------------------------------------
 src/osd/PG.h            | 1011 +-------------
 src/osd/PeeringState.cc | 2797 ++++++++++++++++++++++++++++++++++++++
 src/osd/PeeringState.h  |  992 ++++++++++++++
 src/osd/PrimaryLogPG.cc |   22 +-
 src/osd/PrimaryLogPG.h  |    1 +
 8 files changed, 3847 insertions(+), 3809 deletions(-)
 create mode 100644 src/osd/PeeringState.cc
 create mode 100644 src/osd/PeeringState.h

diff --git a/src/osd/CMakeLists.txt b/src/osd/CMakeLists.txt
index a1b16b59892..74eb6b9b682 100644
--- a/src/osd/CMakeLists.txt
+++ b/src/osd/CMakeLists.txt
@@ -33,6 +33,7 @@ set(osd_srcs
   mClockOpClassQueue.cc
   mClockClientQueue.cc
   OpQueueItem.cc
+  PeeringState.cc
   ${CMAKE_SOURCE_DIR}/src/common/TrackedOp.cc
   ${CMAKE_SOURCE_DIR}/src/objclass/class_api.cc
   ${CMAKE_SOURCE_DIR}/src/mgr/OSDPerfMetricTypes.cc
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 0f2f9314d5f..24fc51b7306 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -7394,7 +7394,7 @@ void OSD::handle_scrub(MOSDScrub *m)
 	  std::make_shared<PGPeeringEvent>(
 	    get_osdmap_epoch(),
 	    get_osdmap_epoch(),
-	    PG::RequestScrub(m->deep, m->repair))));
+	    PeeringState::RequestScrub(m->deep, m->repair))));
   }
 
   m->put();
@@ -7420,7 +7420,7 @@ void OSD::handle_fast_scrub(MOSDScrub2 *m)
 	  std::make_shared<PGPeeringEvent>(
 	    m->epoch,
 	    m->epoch,
-	    PG::RequestScrub(m->deep, m->repair))));
+	    PeeringState::RequestScrub(m->deep, m->repair))));
   }
   m->put();
 }
@@ -9408,7 +9408,7 @@ void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
       PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  m->get_epoch(), m->get_epoch(),
-	  PG::DeleteStart())));
+	  PeeringState::DeleteStart())));
   }
   m->put();
 }
@@ -9429,14 +9429,14 @@ void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
	  PGPeeringEventRef(
	    std::make_shared<PGPeeringEvent>(
	      epoch, epoch,
-	      PG::UnsetForceBackfill())));
+	      PeeringState::UnsetForceBackfill())));
       } else {
	enqueue_peering_evt(
	  pgid,
	  PGPeeringEventRef(
	    std::make_shared<PGPeeringEvent>(
	      epoch, epoch,
-	      PG::SetForceBackfill())));
+	      PeeringState::SetForceBackfill())));
       }
     } else if (m->options & OFR_RECOVERY) {
       if (m->options & OFR_CANCEL) {
@@ -9445,14 +9445,14 @@ void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
	  PGPeeringEventRef(
	    std::make_shared<PGPeeringEvent>(
	      epoch, epoch,
-	      PG::UnsetForceRecovery())));
+	      PeeringState::UnsetForceRecovery())));
       } else {
	enqueue_peering_evt(
	  pgid,
	  PGPeeringEventRef(
	    std::make_shared<PGPeeringEvent>(
	      epoch, epoch,
-	      PG::SetForceRecovery())));
+	      PeeringState::SetForceRecovery())));
       }
     }
   }
@@ -9827,7 +9827,7 @@ void OSD::dequeue_delete(
     PGPeeringEventRef(
       std::make_shared<PGPeeringEvent>(
	e, e,
-	PG::DeleteSome())),
+	PeeringState::DeleteSome())),
     handle);
 }
 
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index e641e68e699..4680fccd526 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -2317,7 +2317,7 @@ void PG::all_activated_and_committed()
       std::make_shared<PGPeeringEvent>(
	get_osdmap_epoch(),
	get_osdmap_epoch(),
-	AllReplicasActivated())));
+	PeeringState::AllReplicasActivated())));
 }
 
 bool PG::requeue_scrub(bool high_priority)
@@ -4668,9 +4668,9 @@ void PG::schedule_backfill_retry(float delay)
   std::lock_guard lock(osd->recovery_request_lock);
   osd->recovery_request_timer.add_event_after(
     delay,
-    new QueuePeeringEvt<RequestBackfill>(
+    new QueuePeeringEvt<PeeringState::RequestBackfill>(
       this, get_osdmap_epoch(),
-      RequestBackfill()));
+      PeeringState::RequestBackfill()));
 }
 
 void PG::schedule_recovery_retry(float delay)
@@ -4678,9 +4678,9 @@ void PG::schedule_recovery_retry(float delay)
   std::lock_guard lock(osd->recovery_request_lock);
   osd->recovery_request_timer.add_event_after(
     delay,
-    new QueuePeeringEvt<DoRecovery>(
+    new QueuePeeringEvt<PeeringState::DoRecovery>(
      this, get_osdmap_epoch(),
-      DoRecovery()));
+      PeeringState::DoRecovery()));
 }
 
 void PG::clear_scrub_reserved()
@@ -5938,7 +5938,7 @@ void PG::scrub_finish()
	  std::make_shared<PGPeeringEvent>(
	    get_osdmap_epoch(),
	    get_osdmap_epoch(),
-	    DoRecovery())));
+	    PeeringState::DoRecovery())));
   }
 
   scrub_clear_state(has_error);
@@ -6259,8 +6259,8 @@ void PG::reset_interval_flush()
   dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
   recovery_state.clear_blocked_outgoing();
 
-  Context *c = new QueuePeeringEvt<IntervalFlush>(
-    this, get_osdmap_epoch(), IntervalFlush());
+  Context *c = new QueuePeeringEvt<PeeringState::IntervalFlush>(
+    this, get_osdmap_epoch(), PeeringState::IntervalFlush());
   if (!ch->flush_commit(c)) {
     dout(10) << "Beginning to block outgoing recovery messages" << dendl;
     recovery_state.begin_block_outgoing();
@@ -6847,7 +6847,7 @@ void PG::find_unfound(epoch_t queued, PeeringCtx *rctx)
	new PGPeeringEvent(
	  queued,
	  queued,
-	  PG::UnfoundBackfill()));
+	  PeeringState::UnfoundBackfill()));
       queue_peering_event(evt);
       action = "in backfill";
     } else if (state_test(PG_STATE_RECOVERING)) {
@@ -6855,7 +6855,7 @@ void PG::find_unfound(epoch_t queued, PeeringCtx *rctx)
	new PGPeeringEvent(
	  queued,
	  queued,
-	  PG::UnfoundRecovery()));
+	  PeeringState::UnfoundRecovery()));
       queue_peering_event(evt);
       action = "in recovery";
     } else {
@@ -6885,7 +6885,7 @@ void PG::handle_advance_map(
 
   pool.update(cct, osdmap);
 
-  AdvMap evt(
+  PeeringState::AdvMap evt(
     osdmap, lastmap, newup, up_primary, newacting, acting_primary);
   recovery_state.handle_event(evt, rctx);
@@ -6899,7 +6899,7 @@ void PG::handle_activate_map(PeeringCtx *rctx)
 {
   dout(10) << "handle_activate_map " << dendl;
-  ActMap evt;
+  PeeringState::ActMap evt;
   recovery_state.handle_event(evt, rctx);
   if (osdmap_ref->get_epoch() - last_persisted_osdmap >
       cct->_conf->osd_pg_epoch_persisted_max_stale) {
@@ -6921,14 +6921,14 @@ void PG::handle_activate_map(PeeringCtx *rctx)
 void PG::handle_initialize(PeeringCtx *rctx)
 {
   dout(10) << __func__ << dendl;
-  Initialize evt;
+  PeeringState::Initialize evt;
   recovery_state.handle_event(evt, rctx);
 }
 
 void PG::handle_query_state(Formatter *f)
 {
   dout(10) << "handle_query_state" << dendl;
-  QueryState q(f);
+  PeeringState::QueryState q(f);
   recovery_state.handle_event(q, 0);
 }
 
@@ -6940,23 +6940,15 @@ void PG::update_store_with_options()
   }
 }
 
-struct C_DeleteMore : public Context {
-  PGRef pg;
-  epoch_t epoch;
-  C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {}
-  void finish(int r) override {
-    ceph_abort();
-  }
-  void complete(int r) override {
-    ceph_assert(r == 0);
-    pg->lock();
-    if (!pg->pg_has_reset_since(epoch)) {
-      pg->osd->queue_for_pg_delete(pg->get_pgid(), epoch);
-    }
-    pg->unlock();
-    delete this;
+void PG::C_DeleteMore::complete(int r) {
+  ceph_assert(r == 0);
+  pg->lock();
+  if (!pg->pg_has_reset_since(epoch)) {
+    pg->osd->queue_for_pg_delete(pg->get_pgid(), epoch);
   }
-};
+  pg->unlock();
+  delete this;
+}
 
 void PG::_delete_some(ObjectStore::Transaction *t)
 {
@@ -7061,14 +7053,6 @@ void PG::_delete_some(ObjectStore::Transaction *t)
   }
 }
 
-// Compute pending backfill data
-static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
-{
-  lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " << (local_bytes >> 10)
<< "KiB" - << " primary usage " << (bf_bytes >> 10) << "KiB" << dendl; - return std::max((int64_t)0, bf_bytes - local_bytes); -} - int PG::pg_stat_adjust(osd_stat_t *ns) { osd_stat_t &new_stat = *ns; @@ -7097,2762 +7081,6 @@ int PG::pg_stat_adjust(osd_stat_t *ns) return 0; } - -/*------------ Recovery State Machine----------------*/ -#undef dout_prefix -#define dout_prefix (context< PeeringMachine >().pg->gen_prefix(*_dout) \ - << "state<" << get_state_name() << ">: ") - -/*------Crashed-------*/ -PG::PeeringState::Crashed::Crashed(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Crashed") -{ - context< PeeringMachine >().log_enter(state_name); - ceph_abort_msg("we got a bad state machine event"); -} - - -/*------Initial-------*/ -PG::PeeringState::Initial::Initial(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Initial") -{ - context< PeeringMachine >().log_enter(state_name); -} - -boost::statechart::result PG::PeeringState::Initial::react(const MNotifyRec& notify) -{ - PG *pg = context< PeeringMachine >().pg; - pg->proc_replica_info( - notify.from, notify.notify.info, notify.notify.epoch_sent); - pg->set_last_peering_reset(); - return transit< Primary >(); -} - -boost::statechart::result PG::PeeringState::Initial::react(const MInfoRec& i) -{ - PG *pg = context< PeeringMachine >().pg; - ceph_assert(!pg->is_primary()); - post_event(i); - return transit< Stray >(); -} - -boost::statechart::result PG::PeeringState::Initial::react(const MLogRec& i) -{ - PG *pg = context< PeeringMachine >().pg; - ceph_assert(!pg->is_primary()); - post_event(i); - return transit< Stray >(); -} - -void PG::PeeringState::Initial::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur); -} - -/*------Started-------*/ -PG::PeeringState::Started::Started(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Started") -{ - context< PeeringMachine >().log_enter(state_name); -} - -boost::statechart::result -PG::PeeringState::Started::react(const IntervalFlush&) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl; - context< PeeringMachine >().pg->recovery_state.end_block_outgoing(); - return discard_event(); -} - -boost::statechart::result PG::PeeringState::Started::react(const AdvMap& advmap) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "Started advmap" << dendl; - pg->check_full_transition(advmap.lastmap, advmap.osdmap); - if (pg->should_restart_peering( - advmap.up_primary, - advmap.acting_primary, - advmap.newup, - advmap.newacting, - advmap.lastmap, - advmap.osdmap)) { - ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset" - << dendl; - post_event(advmap); - return transit< Reset >(); - } - pg->remove_down_peer_info(advmap.osdmap); - return discard_event(); -} - -boost::statechart::result PG::PeeringState::Started::react(const QueryState& q) -{ - q.f->open_object_section("state"); - q.f->dump_string("name", state_name); - q.f->dump_stream("enter_time") << enter_time; - q.f->close_section(); - return discard_event(); -} - -void PG::PeeringState::Started::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - utime_t dur = ceph_clock_now() - enter_time; - 
pg->osd->recoverystate_perf->tinc(rs_started_latency, dur); -} - -/*--------Reset---------*/ -PG::PeeringState::Reset::Reset(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Reset") -{ - context< PeeringMachine >().log_enter(state_name); - PG *pg = context< PeeringMachine >().pg; - - pg->flushes_in_progress = 0; - pg->set_last_peering_reset(); -} - -boost::statechart::result -PG::PeeringState::Reset::react(const IntervalFlush&) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl; - context< PeeringMachine >().pg->recovery_state.end_block_outgoing(); - return discard_event(); -} - -boost::statechart::result PG::PeeringState::Reset::react(const AdvMap& advmap) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "Reset advmap" << dendl; - - pg->check_full_transition(advmap.lastmap, advmap.osdmap); - - if (pg->should_restart_peering( - advmap.up_primary, - advmap.acting_primary, - advmap.newup, - advmap.newacting, - advmap.lastmap, - advmap.osdmap)) { - ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again" - << dendl; - pg->start_peering_interval( - advmap.lastmap, - advmap.newup, advmap.up_primary, - advmap.newacting, advmap.acting_primary, - context< PeeringMachine >().get_cur_transaction()); - } - pg->remove_down_peer_info(advmap.osdmap); - pg->check_past_interval_bounds(); - return discard_event(); -} - -boost::statechart::result PG::PeeringState::Reset::react(const ActMap&) -{ - PG *pg = context< PeeringMachine >().pg; - if (pg->should_send_notify() && pg->get_primary().osd >= 0) { - context< PeeringMachine >().send_notify( - pg->get_primary(), - pg_notify_t( - pg->get_primary().shard, pg->pg_whoami.shard, - pg->get_osdmap_epoch(), - pg->get_osdmap_epoch(), - pg->info), - pg->past_intervals); - } - - pg->update_heartbeat_peers(); - pg->take_waiters(); - - return transit< Started >(); -} - -boost::statechart::result PG::PeeringState::Reset::react(const QueryState& q) -{ - q.f->open_object_section("state"); - q.f->dump_string("name", state_name); - q.f->dump_stream("enter_time") << enter_time; - q.f->close_section(); - return discard_event(); -} - -void PG::PeeringState::Reset::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur); -} - -/*-------Start---------*/ -PG::PeeringState::Start::Start(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Start") -{ - context< PeeringMachine >().log_enter(state_name); - - PG *pg = context< PeeringMachine >().pg; - if (pg->is_primary()) { - ldout(pg->cct, 1) << "transitioning to Primary" << dendl; - post_event(MakePrimary()); - } else { //is_stray - ldout(pg->cct, 1) << "transitioning to Stray" << dendl; - post_event(MakeStray()); - } -} - -void PG::PeeringState::Start::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_start_latency, dur); -} - -/*---------Primary--------*/ -PG::PeeringState::Primary::Primary(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Started/Primary") -{ - context< PeeringMachine >().log_enter(state_name); - PG *pg = context< PeeringMachine >().pg; - ceph_assert(pg->want_acting.empty()); - - // set 
CREATING bit until we have peered for the first time. - if (pg->info.history.last_epoch_started == 0) { - pg->state_set(PG_STATE_CREATING); - // use the history timestamp, which ultimately comes from the - // monitor in the create case. - utime_t t = pg->info.history.last_scrub_stamp; - pg->info.stats.last_fresh = t; - pg->info.stats.last_active = t; - pg->info.stats.last_change = t; - pg->info.stats.last_peered = t; - pg->info.stats.last_clean = t; - pg->info.stats.last_unstale = t; - pg->info.stats.last_undegraded = t; - pg->info.stats.last_fullsized = t; - pg->info.stats.last_scrub_stamp = t; - pg->info.stats.last_deep_scrub_stamp = t; - pg->info.stats.last_clean_scrub_stamp = t; - } -} - -boost::statechart::result PG::PeeringState::Primary::react(const MNotifyRec& notevt) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl; - pg->proc_replica_info( - notevt.from, notevt.notify.info, notevt.notify.epoch_sent); - return discard_event(); -} - -boost::statechart::result PG::PeeringState::Primary::react(const ActMap&) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 7) << "handle ActMap primary" << dendl; - pg->publish_stats_to_osd(); - pg->take_waiters(); - return discard_event(); -} - -boost::statechart::result PG::PeeringState::Primary::react( - const SetForceRecovery&) -{ - PG *pg = context< PeeringMachine >().pg; - pg->set_force_recovery(true); - return discard_event(); -} - -boost::statechart::result PG::PeeringState::Primary::react( - const UnsetForceRecovery&) -{ - PG *pg = context< PeeringMachine >().pg; - pg->set_force_recovery(false); - return discard_event(); -} - -boost::statechart::result PG::PeeringState::Primary::react( - const RequestScrub& evt) -{ - PG *pg = context< PeeringMachine >().pg; - if (pg->is_primary()) { - pg->unreg_next_scrub(); - pg->scrubber.must_scrub = true; - pg->scrubber.must_deep_scrub = evt.deep || evt.repair; - pg->scrubber.must_repair = evt.repair; - pg->reg_next_scrub(); - ldout(pg->cct,10) << "marking for scrub" << dendl; - } - return discard_event(); -} - -boost::statechart::result PG::PeeringState::Primary::react( - const SetForceBackfill&) -{ - PG *pg = context< PeeringMachine >().pg; - pg->set_force_backfill(true); - return discard_event(); -} - -boost::statechart::result PG::PeeringState::Primary::react( - const UnsetForceBackfill&) -{ - PG *pg = context< PeeringMachine >().pg; - pg->set_force_backfill(false); - return discard_event(); -} - -void PG::PeeringState::Primary::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - pg->want_acting.clear(); - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur); - pg->clear_primary_state(); - pg->state_clear(PG_STATE_CREATING); -} - -/*---------Peering--------*/ -PG::PeeringState::Peering::Peering(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Started/Primary/Peering"), - history_les_bound(false) -{ - context< PeeringMachine >().log_enter(state_name); - - PG *pg = context< PeeringMachine >().pg; - ceph_assert(!pg->is_peered()); - ceph_assert(!pg->is_peering()); - ceph_assert(pg->is_primary()); - pg->state_set(PG_STATE_PEERING); -} - -boost::statechart::result PG::PeeringState::Peering::react(const AdvMap& advmap) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "Peering advmap" << dendl; - if (prior_set.affected_by_map(*(advmap.osdmap), pg)) { - 
ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl; - post_event(advmap); - return transit< Reset >(); - } - - pg->adjust_need_up_thru(advmap.osdmap); - - return forward_event(); -} - -boost::statechart::result PG::PeeringState::Peering::react(const QueryState& q) -{ - PG *pg = context< PeeringMachine >().pg; - - q.f->open_object_section("state"); - q.f->dump_string("name", state_name); - q.f->dump_stream("enter_time") << enter_time; - - q.f->open_array_section("past_intervals"); - pg->past_intervals.dump(q.f); - q.f->close_section(); - - q.f->open_array_section("probing_osds"); - for (set::iterator p = prior_set.probe.begin(); - p != prior_set.probe.end(); - ++p) - q.f->dump_stream("osd") << *p; - q.f->close_section(); - - if (prior_set.pg_down) - q.f->dump_string("blocked", "peering is blocked due to down osds"); - - q.f->open_array_section("down_osds_we_would_probe"); - for (set::iterator p = prior_set.down.begin(); - p != prior_set.down.end(); - ++p) - q.f->dump_int("osd", *p); - q.f->close_section(); - - q.f->open_array_section("peering_blocked_by"); - for (map::iterator p = prior_set.blocked_by.begin(); - p != prior_set.blocked_by.end(); - ++p) { - q.f->open_object_section("osd"); - q.f->dump_int("osd", p->first); - q.f->dump_int("current_lost_at", p->second); - q.f->dump_string("comment", "starting or marking this osd lost may let us proceed"); - q.f->close_section(); - } - q.f->close_section(); - - if (history_les_bound) { - q.f->open_array_section("peering_blocked_by_detail"); - q.f->open_object_section("item"); - q.f->dump_string("detail","peering_blocked_by_history_les_bound"); - q.f->close_section(); - q.f->close_section(); - } - - q.f->close_section(); - return forward_event(); -} - -void PG::PeeringState::Peering::exit() -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "Leaving Peering" << dendl; - context< PeeringMachine >().log_exit(state_name, enter_time); - pg->state_clear(PG_STATE_PEERING); - pg->clear_probe_targets(); - - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur); -} - - -/*------Backfilling-------*/ -PG::PeeringState::Backfilling::Backfilling(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/Backfilling") -{ - context< PeeringMachine >().log_enter(state_name); - PG *pg = context< PeeringMachine >().pg; - pg->backfill_reserved = true; - pg->queue_recovery(); - pg->state_clear(PG_STATE_BACKFILL_TOOFULL); - pg->state_clear(PG_STATE_BACKFILL_WAIT); - pg->state_set(PG_STATE_BACKFILLING); - pg->publish_stats_to_osd(); -} - -void PG::PeeringState::Backfilling::backfill_release_reservations() -{ - PG *pg = context< PeeringMachine >().pg; - pg->osd->local_reserver.cancel_reservation(pg->info.pgid); - for (set::iterator it = pg->backfill_targets.begin(); - it != pg->backfill_targets.end(); - ++it) { - ceph_assert(*it != pg->pg_whoami); - ConnectionRef con = pg->osd->get_con_osd_cluster( - it->osd, pg->get_osdmap_epoch()); - if (con) { - pg->osd->send_message_osd_cluster( - new MBackfillReserve( - MBackfillReserve::RELEASE, - spg_t(pg->info.pgid.pgid, it->shard), - pg->get_osdmap_epoch()), - con.get()); - } - } -} - -void PG::PeeringState::Backfilling::cancel_backfill() -{ - PG *pg = context< PeeringMachine >().pg; - backfill_release_reservations(); - if (!pg->waiting_on_backfill.empty()) { - pg->waiting_on_backfill.clear(); - pg->finish_recovery_op(hobject_t::get_max()); - } -} - -boost::statechart::result 
-PG::PeeringState::Backfilling::react(const Backfilled &c) -{ - backfill_release_reservations(); - return transit(); -} - -boost::statechart::result -PG::PeeringState::Backfilling::react(const DeferBackfill &c) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl; - pg->state_set(PG_STATE_BACKFILL_WAIT); - pg->state_clear(PG_STATE_BACKFILLING); - cancel_backfill(); - pg->schedule_backfill_retry(c.delay); - return transit(); -} - -boost::statechart::result -PG::PeeringState::Backfilling::react(const UnfoundBackfill &c) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl; - pg->state_set(PG_STATE_BACKFILL_UNFOUND); - pg->state_clear(PG_STATE_BACKFILLING); - cancel_backfill(); - return transit(); -} - -boost::statechart::result -PG::PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &) -{ - PG *pg = context< PeeringMachine >().pg; - pg->state_set(PG_STATE_BACKFILL_TOOFULL); - pg->state_clear(PG_STATE_BACKFILLING); - cancel_backfill(); - pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval); - return transit(); -} - -boost::statechart::result -PG::PeeringState::Backfilling::react(const RemoteReservationRevoked &) -{ - PG *pg = context< PeeringMachine >().pg; - pg->state_set(PG_STATE_BACKFILL_WAIT); - cancel_backfill(); - if (pg->needs_backfill()) { - return transit(); - } else { - // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore - return discard_event(); - } -} - -void PG::PeeringState::Backfilling::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - pg->backfill_reserved = false; - pg->backfill_reserving = false; - pg->state_clear(PG_STATE_BACKFILLING); - pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY); - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur); -} - -/*--WaitRemoteBackfillReserved--*/ - -PG::PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"), - backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin()) -{ - context< PeeringMachine >().log_enter(state_name); - PG *pg = context< PeeringMachine >().pg; - pg->state_set(PG_STATE_BACKFILL_WAIT); - pg->publish_stats_to_osd(); - post_event(RemoteBackfillReserved()); -} - -boost::statechart::result -PG::PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt) -{ - PG *pg = context< PeeringMachine >().pg; - - int64_t num_bytes = pg->info.stats.stats.sum.num_bytes; - ldout(pg->cct, 10) << __func__ << " num_bytes " << num_bytes << dendl; - if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) { - //The primary never backfills itself - ceph_assert(*backfill_osd_it != pg->pg_whoami); - ConnectionRef con = pg->osd->get_con_osd_cluster( - backfill_osd_it->osd, pg->get_osdmap_epoch()); - if (con) { - pg->osd->send_message_osd_cluster( - new MBackfillReserve( - MBackfillReserve::REQUEST, - spg_t(pg->info.pgid.pgid, backfill_osd_it->shard), - pg->get_osdmap_epoch(), - pg->get_backfill_priority(), - num_bytes, - pg->peer_bytes[*backfill_osd_it]), - con.get()); - } - ++backfill_osd_it; - } else { - pg->peer_bytes.clear(); - post_event(AllBackfillsReserved()); - } - return discard_event(); -} - -void 
PG::PeeringState::WaitRemoteBackfillReserved::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur); -} - -void PG::PeeringState::WaitRemoteBackfillReserved::retry() -{ - PG *pg = context< PeeringMachine >().pg; - pg->osd->local_reserver.cancel_reservation(pg->info.pgid); - - // Send CANCEL to all previously acquired reservations - set::const_iterator it, begin, end; - begin = context< Active >().remote_shards_to_reserve_backfill.begin(); - end = context< Active >().remote_shards_to_reserve_backfill.end(); - ceph_assert(begin != end); - for (it = begin; it != backfill_osd_it; ++it) { - //The primary never backfills itself - ceph_assert(*it != pg->pg_whoami); - ConnectionRef con = pg->osd->get_con_osd_cluster( - it->osd, pg->get_osdmap_epoch()); - if (con) { - pg->osd->send_message_osd_cluster( - new MBackfillReserve( - MBackfillReserve::RELEASE, - spg_t(pg->info.pgid.pgid, it->shard), - pg->get_osdmap_epoch()), - con.get()); - } - } - - pg->state_clear(PG_STATE_BACKFILL_WAIT); - pg->state_set(PG_STATE_BACKFILL_TOOFULL); - pg->publish_stats_to_osd(); - - pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval); -} - -boost::statechart::result -PG::PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt) -{ - retry(); - return transit(); -} - -boost::statechart::result -PG::PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt) -{ - retry(); - return transit(); -} - -/*--WaitLocalBackfillReserved--*/ -PG::PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved") -{ - context< PeeringMachine >().log_enter(state_name); - PG *pg = context< PeeringMachine >().pg; - pg->state_set(PG_STATE_BACKFILL_WAIT); - pg->osd->local_reserver.request_reservation( - pg->info.pgid, - new QueuePeeringEvt( - pg, pg->get_osdmap_epoch(), - LocalBackfillReserved()), - pg->get_backfill_priority(), - new QueuePeeringEvt( - pg, pg->get_osdmap_epoch(), - DeferBackfill(0.0))); - pg->publish_stats_to_osd(); -} - -void PG::PeeringState::WaitLocalBackfillReserved::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur); -} - -/*----NotBackfilling------*/ -PG::PeeringState::NotBackfilling::NotBackfilling(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/NotBackfilling") -{ - context< PeeringMachine >().log_enter(state_name); - PG *pg = context< PeeringMachine >().pg; - pg->state_clear(PG_STATE_REPAIR); - pg->publish_stats_to_osd(); -} - -boost::statechart::result -PG::PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt) -{ - return discard_event(); -} - -boost::statechart::result -PG::PeeringState::NotBackfilling::react(const RemoteReservationRejected &evt) -{ - return discard_event(); -} - -void PG::PeeringState::NotBackfilling::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - pg->state_clear(PG_STATE_BACKFILL_UNFOUND); - utime_t dur = ceph_clock_now() - enter_time; - 
-  pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
-}
-
-/*----NotRecovering------*/
-PG::PeeringState::NotRecovering::NotRecovering(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/NotRecovering")
-{
-  context< PeeringMachine >().log_enter(state_name);
-  PG *pg = context< PeeringMachine >().pg;
-  pg->publish_stats_to_osd();
-}
-
-void PG::PeeringState::NotRecovering::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  pg->state_clear(PG_STATE_RECOVERY_UNFOUND);
-  utime_t dur = ceph_clock_now() - enter_time;
-  pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
-}
-
-/*---RepNotRecovering----*/
-PG::PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
-{
-  context< PeeringMachine >().log_enter(state_name);
-}
-
-boost::statechart::result
-PG::PeeringState::RepNotRecovering::react(const RejectRemoteReservation &evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->reject_reservation();
-  post_event(RemoteReservationRejected());
-  return discard_event();
-}
-
-void PG::PeeringState::RepNotRecovering::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  utime_t dur = ceph_clock_now() - enter_time;
-  pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
-}
-
-/*---RepWaitRecoveryReserved--*/
-PG::PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
-{
-  context< PeeringMachine >().log_enter(state_name);
-}
-
-boost::statechart::result
-PG::PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->osd->send_message_osd_cluster(
-    pg->primary.osd,
-    new MRecoveryReserve(
-      MRecoveryReserve::GRANT,
-      spg_t(pg->info.pgid.pgid, pg->primary.shard),
-      pg->get_osdmap_epoch()),
-    pg->get_osdmap_epoch());
-  return transit<RepRecovering>();
-}
-
-boost::statechart::result
-PG::PeeringState::RepWaitRecoveryReserved::react(
-  const RemoteReservationCanceled &evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->clear_reserved_num_bytes();
-  pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
-  return transit<RepNotRecovering>();
-}
-
-void PG::PeeringState::RepWaitRecoveryReserved::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  utime_t dur = ceph_clock_now() - enter_time;
-  pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
-}
-
-/*-RepWaitBackfillReserved*/
-PG::PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
-{
-  context< PeeringMachine >().log_enter(state_name);
-}
-
-boost::statechart::result
-PG::PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  // Use tentative_backfill_full() to make sure enough
-  // space is available to handle target bytes from primary.
-
-  // TODO: If we passed num_objects from primary we could account for
-  // an estimate of the metadata overhead.
-
-  // TODO: If we had compressed_allocated and compressed_original from primary
-  // we could compute compression ratio and adjust accordingly.
-
-  // XXX: There is no way to get omap overhead and this would only apply
-  // to whatever possibly different partition that is storing the database.
-
-  // update_osd_stat() from heartbeat will do this on a new
-  // statfs using pg->primary_num_bytes.
-  uint64_t pending_adjustment = 0;
-  int64_t primary_num_bytes = evt.primary_num_bytes;
-  int64_t local_num_bytes = evt.local_num_bytes;
-  if (primary_num_bytes) {
-    // For erasure coded pool overestimate by a full stripe per object
-    // because we don't know how each object rounded to the nearest stripe
-    if (pg->pool.info.is_erasure()) {
-      primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
-      primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
-      local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
-      local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
-    }
-    pending_adjustment = pending_backfill(pg->cct, primary_num_bytes, local_num_bytes);
-    ldout(pg->cct, 10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB"
-		       << " local " << (local_num_bytes >> 10) << "KiB"
-		       << " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
-		       << dendl;
-  }
-  // This lock protects not only the stats OSDService but also setting the pg primary_num_bytes
-  // That's why we don't immediately unlock
-  Mutex::Locker l(pg->osd->stat_lock);
-  osd_stat_t cur_stat = pg->osd->osd_stat;
-  if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
-      (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
-    ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
-		       << dendl;
-    post_event(RejectRemoteReservation());
-  } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
-	     pg->osd->tentative_backfill_full(pg, pending_adjustment, cur_stat)) {
-    ldout(pg->cct, 10) << "backfill reservation rejected: backfill full"
-		       << dendl;
-    post_event(RejectRemoteReservation());
-  } else {
-    Context *preempt = nullptr;
-    // Don't reserve space if skipped reservation check, this is used
-    // to test the other backfill full check AND in case a corruption
-    // of num_bytes requires ignoring that value and trying the
-    // backfill anyway.
-    if (primary_num_bytes && !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation)
-      pg->set_reserved_num_bytes(primary_num_bytes, local_num_bytes);
-    else
-      pg->clear_reserved_num_bytes();
-    // Use un-ec-adjusted bytes for stats.
-    pg->info.stats.stats.sum.num_bytes = evt.local_num_bytes;
-    if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) {
-      // older peers will interpret preemption as TOOFULL
-      preempt = new QueuePeeringEvt<RemoteBackfillPreempted>(
-	pg, pg->get_osdmap_epoch(),
-	RemoteBackfillPreempted());
-    }
-    pg->osd->remote_reserver.request_reservation(
-      pg->info.pgid,
-      new QueuePeeringEvt<RemoteBackfillReserved>(
-	pg, pg->get_osdmap_epoch(),
-	RemoteBackfillReserved()),
-      evt.priority,
-      preempt);
-  }
-  return transit<RepWaitBackfillReserved>();
-}
-
-boost::statechart::result
-PG::PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-
-  // fall back to a local reckoning of priority if the primary doesn't pass one
-  // (pre-mimic compat)
-  int prio = evt.priority ?
-    evt.priority : pg->get_recovery_priority();
-
-  Context *preempt = nullptr;
-  if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) {
-    // older peers can't handle this
-    preempt = new QueuePeeringEvt<RemoteRecoveryPreempted>(
-      pg, pg->get_osdmap_epoch(),
-      RemoteRecoveryPreempted());
-  }
-
-  pg->osd->remote_reserver.request_reservation(
-    pg->info.pgid,
-    new QueuePeeringEvt<RemoteRecoveryReserved>(
-      pg, pg->get_osdmap_epoch(),
-      RemoteRecoveryReserved()),
-    prio,
-    preempt);
-  return transit<RepWaitRecoveryReserved>();
-}
-
-void PG::PeeringState::RepWaitBackfillReserved::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  utime_t dur = ceph_clock_now() - enter_time;
-  pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
-}
-
-boost::statechart::result
-PG::PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-
-  pg->osd->send_message_osd_cluster(
-    pg->primary.osd,
-    new MBackfillReserve(
-      MBackfillReserve::GRANT,
-      spg_t(pg->info.pgid.pgid, pg->primary.shard),
-      pg->get_osdmap_epoch()),
-    pg->get_osdmap_epoch());
-  return transit<RepRecovering>();
-}
-
-boost::statechart::result
-PG::PeeringState::RepWaitBackfillReserved::react(
-  const RejectRemoteReservation &evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->reject_reservation();
-  post_event(RemoteReservationRejected());
-  return discard_event();
-}
-
-boost::statechart::result
-PG::PeeringState::RepWaitBackfillReserved::react(
-  const RemoteReservationRejected &evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->clear_reserved_num_bytes();
-  pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
-  return transit<RepNotRecovering>();
-}
-
-boost::statechart::result
-PG::PeeringState::RepWaitBackfillReserved::react(
-  const RemoteReservationCanceled &evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->clear_reserved_num_bytes();
-  pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
-  return transit<RepNotRecovering>();
-}
-
-/*---RepRecovering-------*/
-PG::PeeringState::RepRecovering::RepRecovering(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/ReplicaActive/RepRecovering")
-{
-  context< PeeringMachine >().log_enter(state_name);
-}
-
-boost::statechart::result
-PG::PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->clear_reserved_num_bytes();
-  pg->osd->send_message_osd_cluster(
-    pg->primary.osd,
-    new MRecoveryReserve(
-      MRecoveryReserve::REVOKE,
-      spg_t(pg->info.pgid.pgid, pg->primary.shard),
-      pg->get_osdmap_epoch()),
-    pg->get_osdmap_epoch());
-  return discard_event();
-}
-
-boost::statechart::result
-PG::PeeringState::RepRecovering::react(const BackfillTooFull &)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->clear_reserved_num_bytes();
-  pg->osd->send_message_osd_cluster(
-    pg->primary.osd,
-    new MBackfillReserve(
-      MBackfillReserve::TOOFULL,
-      spg_t(pg->info.pgid.pgid, pg->primary.shard),
-      pg->get_osdmap_epoch()),
-    pg->get_osdmap_epoch());
-  return discard_event();
-}
-
-boost::statechart::result
-PG::PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->clear_reserved_num_bytes();
-  pg->osd->send_message_osd_cluster(
-    pg->primary.osd,
-    new MBackfillReserve(
-      MBackfillReserve::REVOKE,
-      spg_t(pg->info.pgid.pgid, pg->primary.shard),
-      pg->get_osdmap_epoch()),
-    pg->get_osdmap_epoch());
-  return discard_event();
-}
-
-void PG::PeeringState::RepRecovering::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  pg->clear_reserved_num_bytes();
-  pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
-  utime_t dur = ceph_clock_now() - enter_time;
-  pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
-}
-
-/*------Activating--------*/
-PG::PeeringState::Activating::Activating(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/Activating")
-{
-  context< PeeringMachine >().log_enter(state_name);
-}
-
-void PG::PeeringState::Activating::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  utime_t dur = ceph_clock_now() - enter_time;
-  pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
-}
-
-PG::PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
-{
-  context< PeeringMachine >().log_enter(state_name);
-  PG *pg = context< PeeringMachine >().pg;
-
-  // Make sure all nodes that are part of the recovery aren't full
-  if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
-      pg->osd->check_osdmap_full(pg->acting_recovery_backfill)) {
-    post_event(RecoveryTooFull());
-    return;
-  }
-
-  pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
-  pg->state_set(PG_STATE_RECOVERY_WAIT);
-  pg->osd->local_reserver.request_reservation(
-    pg->info.pgid,
-    new QueuePeeringEvt<LocalRecoveryReserved>(
-      pg, pg->get_osdmap_epoch(),
-      LocalRecoveryReserved()),
-    pg->get_recovery_priority(),
-    new QueuePeeringEvt<DeferRecovery>(
-      pg, pg->get_osdmap_epoch(),
-      DeferRecovery(0.0)));
-  pg->publish_stats_to_osd();
-}
-
-boost::statechart::result
-PG::PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->state_set(PG_STATE_RECOVERY_TOOFULL);
-  pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
-  return transit<NotRecovering>();
-}
-
-void PG::PeeringState::WaitLocalRecoveryReserved::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  utime_t dur = ceph_clock_now() - enter_time;
-  pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
-}
-
-PG::PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
-    remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
-{
-  context< PeeringMachine >().log_enter(state_name);
-  post_event(RemoteRecoveryReserved());
-}
-
-boost::statechart::result
-PG::PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
-  PG *pg = context< PeeringMachine >().pg;
-
-  if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
-    ceph_assert(*remote_recovery_reservation_it != pg->pg_whoami);
-    ConnectionRef con = pg->osd->get_con_osd_cluster(
-      remote_recovery_reservation_it->osd, pg->get_osdmap_epoch());
-    if (con) {
-      pg->osd->send_message_osd_cluster(
-        new MRecoveryReserve(
-	  MRecoveryReserve::REQUEST,
-	  spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
-	  pg->get_osdmap_epoch(),
-	  pg->get_recovery_priority()),
-	con.get());
-    }
-    ++remote_recovery_reservation_it;
-  } else {
-    post_event(AllRemotesReserved());
-  }
-  return
-    discard_event();
-}
-
-void PG::PeeringState::WaitRemoteRecoveryReserved::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  utime_t dur = ceph_clock_now() - enter_time;
-  pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
-}
-
-PG::PeeringState::Recovering::Recovering(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/Recovering")
-{
-  context< PeeringMachine >().log_enter(state_name);
-
-  PG *pg = context< PeeringMachine >().pg;
-  pg->state_clear(PG_STATE_RECOVERY_WAIT);
-  pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
-  pg->state_set(PG_STATE_RECOVERING);
-  ceph_assert(!pg->state_test(PG_STATE_ACTIVATING));
-  pg->publish_stats_to_osd();
-  pg->queue_recovery();
-}
-
-void PG::PeeringState::Recovering::release_reservations(bool cancel)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  ceph_assert(cancel || !pg->pg_log.get_missing().have_missing());
-
-  // release remote reservations
-  for (set<pg_shard_t>::const_iterator i =
-	 context< Active >().remote_shards_to_reserve_recovery.begin();
-       i != context< Active >().remote_shards_to_reserve_recovery.end();
-       ++i) {
-    if (*i == pg->pg_whoami) // skip myself
-      continue;
-    ConnectionRef con = pg->osd->get_con_osd_cluster(
-      i->osd, pg->get_osdmap_epoch());
-    if (con) {
-      pg->osd->send_message_osd_cluster(
-        new MRecoveryReserve(
-	  MRecoveryReserve::RELEASE,
-	  spg_t(pg->info.pgid.pgid, i->shard),
-	  pg->get_osdmap_epoch()),
-	con.get());
-    }
-  }
-}
-
-boost::statechart::result
-PG::PeeringState::Recovering::react(const AllReplicasRecovered &evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->state_clear(PG_STATE_FORCED_RECOVERY);
-  release_reservations();
-  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
-  return transit<Recovered>();
-}
-
-boost::statechart::result
-PG::PeeringState::Recovering::react(const RequestBackfill &evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->state_clear(PG_STATE_FORCED_RECOVERY);
-  release_reservations();
-  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
-  // XXX: Is this needed?
-  pg->publish_stats_to_osd();
-  return transit<WaitLocalBackfillReserved>();
-}
-
-boost::statechart::result
-PG::PeeringState::Recovering::react(const DeferRecovery &evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  if (!pg->state_test(PG_STATE_RECOVERING)) {
-    // we may have finished recovery and have an AllReplicasRecovered
-    // event queued to move us to the next state.
-    ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl;
-    return discard_event();
-  }
-  ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
-  pg->state_set(PG_STATE_RECOVERY_WAIT);
-  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
-  release_reservations(true);
-  pg->schedule_recovery_retry(evt.delay);
-  return transit<NotRecovering>();
-}
-
-boost::statechart::result
-PG::PeeringState::Recovering::react(const UnfoundRecovery &evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl;
-  pg->state_set(PG_STATE_RECOVERY_UNFOUND);
-  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
-  release_reservations(true);
-  return transit<NotRecovering>();
-}
-
-void PG::PeeringState::Recovering::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  utime_t dur = ceph_clock_now() - enter_time;
-  pg->state_clear(PG_STATE_RECOVERING);
-  pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
-}
-
-PG::PeeringState::Recovered::Recovered(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/Recovered")
-{
-  pg_shard_t auth_log_shard;
-
-  context< PeeringMachine >().log_enter(state_name);
-
-  PG *pg = context< PeeringMachine >().pg;
-
-  ceph_assert(!pg->needs_recovery());
-
-  // if we finished backfill, all acting are active; recheck if
-  // DEGRADED | UNDERSIZED is appropriate.
-  ceph_assert(!pg->acting_recovery_backfill.empty());
-  if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
-      pg->acting_recovery_backfill.size()) {
-    pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
-    pg->publish_stats_to_osd();
-  }
-
-  // adjust acting set?  (e.g. because backfill completed...)
-  bool history_les_bound = false;
-  if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
-						 true, &history_les_bound)) {
-    ceph_assert(pg->want_acting.size());
-  } else if (!pg->async_recovery_targets.empty()) {
-    pg->choose_acting(auth_log_shard, true, &history_les_bound);
-  }
-
-  if (context< Active >().all_replicas_activated &&
-      pg->async_recovery_targets.empty())
-    post_event(GoClean());
-}
-
-void PG::PeeringState::Recovered::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  utime_t dur = ceph_clock_now() - enter_time;
-  pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
-}
-
-PG::PeeringState::Clean::Clean(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/Clean")
-{
-  context< PeeringMachine >().log_enter(state_name);
-
-  PG *pg = context< PeeringMachine >().pg;
-
-  if (pg->info.last_complete != pg->info.last_update) {
-    ceph_abort();
-  }
-  Context *c = pg->finish_recovery();
-  context< PeeringMachine >().get_cur_transaction()->register_on_commit(c);
-
-  pg->try_mark_clean();
-}
-
-void PG::PeeringState::Clean::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  pg->state_clear(PG_STATE_CLEAN);
-  utime_t dur = ceph_clock_now() - enter_time;
-  pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
-}
-
-template <typename T>
-set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
-{
-  set<int> osds_found;
-  set<pg_shard_t> out;
-  for (typename T::const_iterator i = in.begin();
-       i != in.end();
-       ++i) {
-    if (*i != skip && !osds_found.count(i->osd)) {
-      osds_found.insert(i->osd);
-      out.insert(*i);
-    }
-  }
-  return out;
-}
-
-/*---------Active---------*/
-PG::PeeringState::Active::Active(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/Primary/Active"),
-    remote_shards_to_reserve_recovery(
-      unique_osd_shard_set(
-	context< PeeringMachine >().pg->pg_whoami,
-	context< PeeringMachine >().pg->acting_recovery_backfill)),
-    remote_shards_to_reserve_backfill(
-      unique_osd_shard_set(
-	context< PeeringMachine >().pg->pg_whoami,
-	context< PeeringMachine >().pg->backfill_targets)),
-    all_replicas_activated(false)
-{
-  context< PeeringMachine >().log_enter(state_name);
-
-  PG *pg = context< PeeringMachine >().pg;
-
-  ceph_assert(!pg->backfill_reserving);
-  ceph_assert(!pg->backfill_reserved);
-  ceph_assert(pg->is_primary());
-  ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
-  pg->start_flush(context< PeeringMachine >().get_cur_transaction());
-  pg->activate(*context< PeeringMachine >().get_cur_transaction(),
-	       pg->get_osdmap_epoch(),
-	       *context< PeeringMachine >().get_query_map(),
-	       context< PeeringMachine >().get_info_map(),
-	       context< PeeringMachine >().get_recovery_ctx());
-
-  // everyone has to commit/ack before we are truly active
-  pg->blocked_by.clear();
-  for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin();
-       p != pg->acting_recovery_backfill.end();
-       ++p) {
-    if (p->shard != pg->pg_whoami.shard) {
-      pg->blocked_by.insert(p->shard);
-    }
-  }
-  pg->publish_stats_to_osd();
-  ldout(pg->cct, 10) << "Activate Finished" << dendl;
-}
-
-boost::statechart::result PG::PeeringState::Active::react(const AdvMap& advmap)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  if (pg->should_restart_peering(
-	advmap.up_primary,
-	advmap.acting_primary,
-	advmap.newup,
-	advmap.newacting,
-	advmap.lastmap,
-	advmap.osdmap)) {
-    ldout(pg->cct, 10) <<
"Active advmap interval change, fast return" << dendl; - return forward_event(); - } - ldout(pg->cct, 10) << "Active advmap" << dendl; - bool need_publish = false; - - if (advmap.osdmap->require_osd_release >= CEPH_RELEASE_MIMIC) { - const auto& new_removed_snaps = advmap.osdmap->get_new_removed_snaps(); - auto i = new_removed_snaps.find(pg->info.pgid.pool()); - if (i != new_removed_snaps.end()) { - bool bad = false; - for (auto j : i->second) { - if (pg->snap_trimq.intersects(j.first, j.second)) { - decltype(pg->snap_trimq) added, overlap; - added.insert(j.first, j.second); - overlap.intersection_of(pg->snap_trimq, added); - if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) { - lderr(pg->cct) << __func__ << " removed_snaps already contains " - << overlap << ", but this is the first mimic+ osdmap," - << " so it's expected" << dendl; - } else { - lderr(pg->cct) << __func__ << " removed_snaps already contains " - << overlap << dendl; - bad = true; - } - pg->snap_trimq.union_of(added); - } else { - pg->snap_trimq.insert(j.first, j.second); - } - } - if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) { - // at upgrade, we report *all* previously removed snaps as removed in - // the first mimic epoch. remove the ones we previously divined were - // removed (and subsequently purged) from the trimq. - lderr(pg->cct) << __func__ << " first mimic map, filtering purged_snaps" - << " from new removed_snaps" << dendl; - pg->snap_trimq.subtract(pg->info.purged_snaps); - } - ldout(pg->cct,10) << __func__ << " new removed_snaps " << i->second - << ", snap_trimq now " << pg->snap_trimq << dendl; - ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps); - pg->dirty_info = true; - pg->dirty_big_info = true; - } - - const auto& new_purged_snaps = advmap.osdmap->get_new_purged_snaps(); - auto j = new_purged_snaps.find(pg->info.pgid.pool()); - if (j != new_purged_snaps.end()) { - bool bad = false; - for (auto k : j->second) { - if (!pg->info.purged_snaps.contains(k.first, k.second)) { - decltype(pg->info.purged_snaps) rm, overlap; - rm.insert(k.first, k.second); - overlap.intersection_of(pg->info.purged_snaps, rm); - lderr(pg->cct) << __func__ << " purged_snaps does not contain " - << rm << ", only " << overlap << dendl; - pg->info.purged_snaps.subtract(overlap); - // This can currently happen in the normal (if unlikely) course of - // events. Because adding snaps to purged_snaps does not increase - // the pg version or add a pg log entry, we don't reliably propagate - // purged_snaps additions to other OSDs. - // One example: - // - purge S - // - primary and replicas update purged_snaps - // - no object updates - // - pg mapping changes, new primary on different node - // - new primary pg version == eversion_t(), so info is not - // propagated. - //bad = true; - } else { - pg->info.purged_snaps.erase(k.first, k.second); - } - } - ldout(pg->cct,10) << __func__ << " new purged_snaps " << j->second - << ", now " << pg->info.purged_snaps << dendl; - ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps); - pg->dirty_info = true; - pg->dirty_big_info = true; - } - if (pg->dirty_big_info) { - // share updated purged_snaps to mgr/mon so that we (a) stop reporting - // purged snaps and (b) perhaps share more snaps that we have purged - // but didn't fit in pg_stat_t. 
- need_publish = true; - pg->share_pg_info(); - } - } else if (!pg->pool.newly_removed_snaps.empty()) { - pg->snap_trimq.union_of(pg->pool.newly_removed_snaps); - ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl; - pg->dirty_info = true; - pg->dirty_big_info = true; - } - - for (size_t i = 0; i < pg->want_acting.size(); i++) { - int osd = pg->want_acting[i]; - if (!advmap.osdmap->is_up(osd)) { - pg_shard_t osd_with_shard(osd, shard_id_t(i)); - ceph_assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard)); - } - } - - /* Check for changes in pool size (if the acting set changed as a result, - * this does not matter) */ - if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) != - pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) { - if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) { - pg->state_clear(PG_STATE_UNDERSIZED); - } else { - pg->state_set(PG_STATE_UNDERSIZED); - } - // degraded changes will be detected by call from publish_stats_to_osd() - need_publish = true; - } - - // if we haven't reported our PG stats in a long time, do so now. - if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) { - ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch) - << " epochs" << dendl; - need_publish = true; - } - - if (need_publish) - pg->publish_stats_to_osd(); - - return forward_event(); -} - -boost::statechart::result PG::PeeringState::Active::react(const ActMap&) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "Active: handling ActMap" << dendl; - ceph_assert(pg->is_primary()); - - if (pg->have_unfound()) { - // object may have become unfound - pg->discover_all_missing(*context< PeeringMachine >().get_query_map()); - } - - if (pg->cct->_conf->osd_check_for_log_corruption) - pg->check_log_for_corruption(pg->osd->store); - - uint64_t unfound = pg->missing_loc.num_unfound(); - if (unfound > 0 && - pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) { - if (pg->cct->_conf->osd_auto_mark_unfound_lost) { - pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound - << " objects unfound and apparently lost, would automatically " - << "mark these objects lost but this feature is not yet implemented " - << "(osd_auto_mark_unfound_lost)"; - } else - pg->osd->clog->error() << pg->info.pgid.pgid << " has " - << unfound << " objects unfound and apparently lost"; - } - - if (pg->is_active()) { - ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl; - pg->kick_snap_trim(); - } - - if (pg->is_peered() && - !pg->is_clean() && - !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) && - (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) { - pg->queue_recovery(); - } - return forward_event(); -} - -boost::statechart::result PG::PeeringState::Active::react(const MNotifyRec& notevt) -{ - PG *pg = context< PeeringMachine >().pg; - ceph_assert(pg->is_primary()); - if (pg->peer_info.count(notevt.from)) { - ldout(pg->cct, 10) << "Active: got notify from " << notevt.from - << ", already have info from that osd, ignoring" - << dendl; - } else if (pg->peer_purged.count(notevt.from)) { - ldout(pg->cct, 10) << "Active: got notify from " << notevt.from - << ", already purged that peer, ignoring" - << dendl; - } else { - ldout(pg->cct, 10) << "Active: got notify from " << notevt.from - << ", calling proc_replica_info and discover_all_missing" - << dendl; - 
pg->proc_replica_info( - notevt.from, notevt.notify.info, notevt.notify.epoch_sent); - if (pg->have_unfound() || (pg->is_degraded() && pg->might_have_unfound.count(notevt.from))) { - pg->discover_all_missing(*context< PeeringMachine >().get_query_map()); - } - } - return discard_event(); -} - -boost::statechart::result PG::PeeringState::Active::react(const MTrim& trim) -{ - PG *pg = context< PeeringMachine >().pg; - ceph_assert(pg->is_primary()); - - // peer is informing us of their last_complete_ondisk - ldout(pg->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl; - pg->peer_last_complete_ondisk[pg_shard_t(trim.from, trim.shard)] = trim.trim_to; - - // trim log when the pg is recovered - pg->calc_min_last_complete_ondisk(); - return discard_event(); -} - -boost::statechart::result PG::PeeringState::Active::react(const MInfoRec& infoevt) -{ - PG *pg = context< PeeringMachine >().pg; - ceph_assert(pg->is_primary()); - - ceph_assert(!pg->acting_recovery_backfill.empty()); - // don't update history (yet) if we are active and primary; the replica - // may be telling us they have activated (and committed) but we can't - // share that until _everyone_ does the same. - if (pg->is_acting_recovery_backfill(infoevt.from) && - pg->peer_activated.count(infoevt.from) == 0) { - ldout(pg->cct, 10) << " peer osd." << infoevt.from - << " activated and committed" << dendl; - pg->peer_activated.insert(infoevt.from); - pg->blocked_by.erase(infoevt.from.shard); - pg->publish_stats_to_osd(); - if (pg->peer_activated.size() == pg->acting_recovery_backfill.size()) { - pg->all_activated_and_committed(); - } - } - return discard_event(); -} - -boost::statechart::result PG::PeeringState::Active::react(const MLogRec& logevt) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "searching osd." 
<< logevt.from - << " log for unfound items" << dendl; - pg->proc_replica_log( - logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from); - bool got_missing = pg->search_for_missing( - pg->peer_info[logevt.from], - pg->peer_missing[logevt.from], - logevt.from, - context< PeeringMachine >().get_recovery_ctx()); - // If there are missing AND we are "fully" active then start recovery now - if (got_missing && pg->state_test(PG_STATE_ACTIVE)) { - post_event(DoRecovery()); - } - return discard_event(); -} - -boost::statechart::result PG::PeeringState::Active::react(const QueryState& q) -{ - PG *pg = context< PeeringMachine >().pg; - - q.f->open_object_section("state"); - q.f->dump_string("name", state_name); - q.f->dump_stream("enter_time") << enter_time; - - { - q.f->open_array_section("might_have_unfound"); - for (set::iterator p = pg->might_have_unfound.begin(); - p != pg->might_have_unfound.end(); - ++p) { - q.f->open_object_section("osd"); - q.f->dump_stream("osd") << *p; - if (pg->peer_missing.count(*p)) { - q.f->dump_string("status", "already probed"); - } else if (pg->peer_missing_requested.count(*p)) { - q.f->dump_string("status", "querying"); - } else if (!pg->get_osdmap()->is_up(p->osd)) { - q.f->dump_string("status", "osd is down"); - } else { - q.f->dump_string("status", "not queried"); - } - q.f->close_section(); - } - q.f->close_section(); - } - { - q.f->open_object_section("recovery_progress"); - pg->dump_recovery_info(q.f); - q.f->close_section(); - } - - { - q.f->open_object_section("scrub"); - q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start; - q.f->dump_bool("scrubber.active", pg->scrubber.active); - q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state)); - q.f->dump_stream("scrubber.start") << pg->scrubber.start; - q.f->dump_stream("scrubber.end") << pg->scrubber.end; - q.f->dump_stream("scrubber.max_end") << pg->scrubber.max_end; - q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update; - q.f->dump_bool("scrubber.deep", pg->scrubber.deep); - { - q.f->open_array_section("scrubber.waiting_on_whom"); - for (set::iterator p = pg->scrubber.waiting_on_whom.begin(); - p != pg->scrubber.waiting_on_whom.end(); - ++p) { - q.f->dump_stream("shard") << *p; - } - q.f->close_section(); - } - q.f->close_section(); - } - - q.f->close_section(); - return forward_event(); -} - -boost::statechart::result PG::PeeringState::Active::react(const AllReplicasActivated &evt) -{ - PG *pg = context< PeeringMachine >().pg; - pg_t pgid = pg->info.pgid.pgid; - - all_replicas_activated = true; - - pg->state_clear(PG_STATE_ACTIVATING); - pg->state_clear(PG_STATE_CREATING); - pg->state_clear(PG_STATE_PREMERGE); - - bool merge_target; - if (pg->pool.info.is_pending_merge(pgid, &merge_target)) { - pg->state_set(PG_STATE_PEERED); - pg->state_set(PG_STATE_PREMERGE); - - if (pg->actingset.size() != pg->get_osdmap()->get_pg_size(pgid)) { - if (merge_target) { - pg_t src = pgid; - src.set_ps(pg->pool.info.get_pg_num_pending()); - assert(src.get_parent() == pgid); - pg->osd->set_not_ready_to_merge_target(pgid, src); - } else { - pg->osd->set_not_ready_to_merge_source(pgid); - } - } - } else if (pg->acting.size() < pg->pool.info.min_size) { - pg->state_set(PG_STATE_PEERED); - } else { - pg->state_set(PG_STATE_ACTIVE); - } - - if (pg->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) { - pg->osd->send_pg_created(pgid); - } - - pg->info.history.last_epoch_started = pg->info.last_epoch_started; - pg->info.history.last_interval_started = 
pg->info.last_interval_started; - pg->dirty_info = true; - - pg->share_pg_info(); - pg->publish_stats_to_osd(); - - pg->check_local(); - - // waiters - if (pg->flushes_in_progress == 0) { - pg->requeue_ops(pg->waiting_for_peered); - } else if (!pg->waiting_for_peered.empty()) { - ldout(pg->cct, 10) << __func__ << " flushes in progress, moving " - << pg->waiting_for_peered.size() - << " items to waiting_for_flush" - << dendl; - ceph_assert(pg->waiting_for_flush.empty()); - pg->waiting_for_flush.swap(pg->waiting_for_peered); - } - - pg->on_activate(); - - return discard_event(); -} - -void PG::PeeringState::Active::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - pg->osd->local_reserver.cancel_reservation(pg->info.pgid); - - pg->blocked_by.clear(); - pg->backfill_reserved = false; - pg->backfill_reserving = false; - pg->state_clear(PG_STATE_ACTIVATING); - pg->state_clear(PG_STATE_DEGRADED); - pg->state_clear(PG_STATE_UNDERSIZED); - pg->state_clear(PG_STATE_BACKFILL_TOOFULL); - pg->state_clear(PG_STATE_BACKFILL_WAIT); - pg->state_clear(PG_STATE_RECOVERY_WAIT); - pg->state_clear(PG_STATE_RECOVERY_TOOFULL); - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_active_latency, dur); - pg->agent_stop(); -} - -/*------ReplicaActive-----*/ -PG::PeeringState::ReplicaActive::ReplicaActive(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Started/ReplicaActive") -{ - context< PeeringMachine >().log_enter(state_name); - - PG *pg = context< PeeringMachine >().pg; - pg->start_flush(context< PeeringMachine >().get_cur_transaction()); -} - - -boost::statechart::result PG::PeeringState::ReplicaActive::react( - const Activate& actevt) { - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl; - map > query_map; - pg->activate(*context< PeeringMachine >().get_cur_transaction(), - actevt.activation_epoch, - query_map, NULL, NULL); - ldout(pg->cct, 10) << "Activate Finished" << dendl; - return discard_event(); -} - -boost::statechart::result PG::PeeringState::ReplicaActive::react(const MInfoRec& infoevt) -{ - PG *pg = context< PeeringMachine >().pg; - pg->proc_primary_info(*context().get_cur_transaction(), - infoevt.info); - return discard_event(); -} - -boost::statechart::result PG::PeeringState::ReplicaActive::react(const MLogRec& logevt) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "received log from " << logevt.from << dendl; - ObjectStore::Transaction* t = context().get_cur_transaction(); - pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from); - ceph_assert(pg->pg_log.get_head() == pg->info.last_update); - - return discard_event(); -} - -boost::statechart::result PG::PeeringState::ReplicaActive::react(const MTrim& trim) -{ - PG *pg = context< PeeringMachine >().pg; - // primary is instructing us to trim - pg->pg_log.trim(trim.trim_to, pg->info); - pg->dirty_info = true; - return discard_event(); -} - -boost::statechart::result PG::PeeringState::ReplicaActive::react(const ActMap&) -{ - PG *pg = context< PeeringMachine >().pg; - if (pg->should_send_notify() && pg->get_primary().osd >= 0) { - context< PeeringMachine >().send_notify( - pg->get_primary(), - pg_notify_t( - pg->get_primary().shard, pg->pg_whoami.shard, - pg->get_osdmap_epoch(), - pg->get_osdmap_epoch(), - pg->info), - pg->past_intervals); - } - pg->take_waiters(); - return discard_event(); -} - 
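(For readers following the handlers in this file without the boost::statechart docs: every reaction returns one of discard_event(), forward_event(), or transit<Next>(), and queued follow-ups go through post_event(). Below is a minimal, self-contained sketch of that idiom; the states and events are made up for the example, this is not Ceph code.)

// statechart_sketch.cc -- minimal illustration of the reaction idioms used
// throughout this file: custom_reaction lists, discard_event()/transit<>()
// results, and post_event() for queued follow-ups.
#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/event.hpp>
#include <boost/statechart/custom_reaction.hpp>
#include <boost/mpl/list.hpp>
#include <iostream>

namespace sc = boost::statechart;

struct GotLog : sc::event<GotLog> {};
struct Activate : sc::event<Activate> {};

struct StrayLike;
struct Machine : sc::state_machine<Machine, StrayLike> {};

struct ReplicaActiveLike : sc::simple_state<ReplicaActiveLike, Machine> {
  typedef boost::mpl::list<
    sc::custom_reaction<Activate>
  > reactions;
  sc::result react(const Activate&) {
    std::cout << "ReplicaActiveLike: handling Activate\n";
    return discard_event();   // consume the event; nothing outer sees it
  }
};

struct StrayLike : sc::simple_state<StrayLike, Machine> {
  typedef boost::mpl::list<
    sc::custom_reaction<GotLog>
  > reactions;
  sc::result react(const GotLog&) {
    // same shape as Stray::react(const MLogRec&) below: queue a follow-up,
    // then change state; the posted event is dispatched after the
    // transition completes, so the new state handles it.
    post_event(Activate());
    return transit<ReplicaActiveLike>();
  }
};

int main() {
  Machine m;
  m.initiate();
  m.process_event(GotLog());  // StrayLike -> ReplicaActiveLike, then Activate
}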
-boost::statechart::result PG::PeeringState::ReplicaActive::react(
-  const MQuery& query)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->fulfill_query(query, context< PeeringMachine >().get_recovery_ctx());
-  return discard_event();
-}
-
-boost::statechart::result PG::PeeringState::ReplicaActive::react(const QueryState& q)
-{
-  q.f->open_object_section("state");
-  q.f->dump_string("name", state_name);
-  q.f->dump_stream("enter_time") << enter_time;
-  q.f->close_section();
-  return forward_event();
-}
-
-void PG::PeeringState::ReplicaActive::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  pg->clear_reserved_num_bytes();
-  pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
-  utime_t dur = ceph_clock_now() - enter_time;
-  pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
-}
-
-/*-------Stray---*/
-PG::PeeringState::Stray::Stray(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/Stray")
-{
-  context< PeeringMachine >().log_enter(state_name);
-
-  PG *pg = context< PeeringMachine >().pg;
-  ceph_assert(!pg->is_peered());
-  ceph_assert(!pg->is_peering());
-  ceph_assert(!pg->is_primary());
-
-  if (!pg->get_osdmap()->have_pg_pool(pg->get_pgid().pool())) {
-    ldout(pg->cct,10) << __func__ << " pool is deleted" << dendl;
-    post_event(DeleteStart());
-  } else {
-    pg->start_flush(context< PeeringMachine >().get_cur_transaction());
-  }
-}
-
-boost::statechart::result PG::PeeringState::Stray::react(const MLogRec& logevt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  MOSDPGLog *msg = logevt.msg.get();
-  ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
-
-  ObjectStore::Transaction* t = context< PeeringMachine >().get_cur_transaction();
-  if (msg->info.last_backfill == hobject_t()) {
-    // restart backfill
-    pg->unreg_next_scrub();
-    pg->info = msg->info;
-    pg->reg_next_scrub();
-    pg->dirty_info = true;
-    pg->dirty_big_info = true;  // maybe.
-
-    PGLogEntryHandler rollbacker{pg, t};
-    pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
-
-    pg->pg_log.reset_backfill();
-  } else {
-    pg->merge_log(*t, msg->info, msg->log, logevt.from);
-  }
-
-  ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
-
-  post_event(Activate(logevt.msg->info.last_epoch_started));
-  return transit< ReplicaActive >();
-}
-
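(Several reactions in this file arm deferred callbacks via QueuePeeringEvt, as in WaitDeleteReserved further below, which requeues its event only if the PG has not gone through a peering reset since the callback was armed. A toy sketch of that epoch-guard pattern follows; PGLike and Guarded are hypothetical stand-ins, not the real PG/Context classes.)

// epoch_guard.cc -- toy version of the epoch guard used by callbacks like
// QueuePeeringEvt and C_DeleteMore: capture the map epoch when the callback
// is armed, and drop the event if the PG saw a peering reset afterwards.
#include <iostream>
#include <string>

using epoch_t = unsigned;

struct PGLike {
  epoch_t last_peering_reset = 0;
  bool pg_has_reset_since(epoch_t e) const { return e < last_peering_reset; }
  void queue_peering_event(const std::string &evt) {
    std::cout << "queued " << evt << "\n";
  }
};

struct Guarded {
  PGLike *pg;
  epoch_t epoch;      // epoch at arming time
  std::string evt;
  void finish() {     // what a Context::finish() would do
    if (!pg->pg_has_reset_since(epoch))
      pg->queue_peering_event(evt);   // still the same interval: deliver
    else
      std::cout << "dropped stale " << evt << "\n";
  }
};

int main() {
  PGLike pg;
  Guarded cb{&pg, /*epoch=*/10, "DeleteReserved"};
  pg.last_peering_reset = 12;  // an interval change happened meanwhile
  cb.finish();                 // -> dropped stale DeleteReserved
}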
-boost::statechart::result PG::PeeringState::Stray::react(const MInfoRec& infoevt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
-
-  if (pg->info.last_update > infoevt.info.last_update) {
-    // rewind divergent log entries
-    ObjectStore::Transaction* t = context< PeeringMachine >().get_cur_transaction();
-    pg->rewind_divergent_log(*t, infoevt.info.last_update);
-    pg->info.stats = infoevt.info.stats;
-    pg->info.hit_set = infoevt.info.hit_set;
-  }
-
-  ceph_assert(infoevt.info.last_update == pg->info.last_update);
-  ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
-
-  post_event(Activate(infoevt.info.last_epoch_started));
-  return transit< ReplicaActive >();
-}
-
-boost::statechart::result PG::PeeringState::Stray::react(const MQuery& query)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->fulfill_query(query, context< PeeringMachine >().get_recovery_ctx());
-  return discard_event();
-}
-
-boost::statechart::result PG::PeeringState::Stray::react(const ActMap&)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
-    context< PeeringMachine >().send_notify(
-      pg->get_primary(),
-      pg_notify_t(
-        pg->get_primary().shard, pg->pg_whoami.shard,
-        pg->get_osdmap_epoch(),
-        pg->get_osdmap_epoch(),
-        pg->info),
-      pg->past_intervals);
-  }
-  pg->take_waiters();
-  return discard_event();
-}
-
-void PG::PeeringState::Stray::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  utime_t dur = ceph_clock_now() - enter_time;
-  pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
-}
-
-
-/*--------ToDelete----------*/
-PG::PeeringState::ToDelete::ToDelete(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/ToDelete")
-{
-  context< PeeringMachine >().log_enter(state_name);
-  PG *pg = context< PeeringMachine >().pg;
-  pg->osd->logger->inc(l_osd_pg_removing);
-}
-
-void PG::PeeringState::ToDelete::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  // note: on a successful removal, this path doesn't execute. see
-  // _delete_some().
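-  // (so this exit() runs only when the delete is interrupted, e.g. by an
-  // interval change or a delete-priority reset)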
-  pg->osd->logger->dec(l_osd_pg_removing);
-  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
-}
-
-/*----WaitDeleteReserved----*/
-PG::PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg,
-               "Started/ToDelete/WaitDeleteReseved")
-{
-  context< PeeringMachine >().log_enter(state_name);
-  PG *pg = context< PeeringMachine >().pg;
-  context< ToDelete >().priority = pg->get_delete_priority();
-  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
-  pg->osd->local_reserver.request_reservation(
-    pg->info.pgid,
-    new QueuePeeringEvt<DeleteReserved>(
-      pg, pg->get_osdmap_epoch(),
-      DeleteReserved()),
-    context< ToDelete >().priority,
-    new QueuePeeringEvt<DeleteInterrupted>(
-      pg, pg->get_osdmap_epoch(),
-      DeleteInterrupted()));
-}
-
-boost::statechart::result PG::PeeringState::ToDelete::react(
-  const ActMap& evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  if (pg->get_delete_priority() != priority) {
-    ldout(pg->cct,10) << __func__ << " delete priority changed, resetting"
-                      << dendl;
-    return transit< ToDelete >();
-  }
-  return discard_event();
-}
-
-void PG::PeeringState::WaitDeleteReserved::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-}
-
-/*----Deleting-----*/
-PG::PeeringState::Deleting::Deleting(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/ToDelete/Deleting")
-{
-  context< PeeringMachine >().log_enter(state_name);
-  PG *pg = context< PeeringMachine >().pg;
-  pg->deleting = true;
-  ObjectStore::Transaction* t = context< PeeringMachine >().get_cur_transaction();
-  pg->on_removal(t);
-  t->register_on_commit(new C_DeleteMore(pg, pg->get_osdmap_epoch()));
-}
-
-boost::statechart::result PG::PeeringState::Deleting::react(
-  const DeleteSome& evt)
-{
-  PG *pg = context< PeeringMachine >().pg;
-  pg->_delete_some(context< PeeringMachine >().get_cur_transaction());
-  return discard_event();
-}
-
-void PG::PeeringState::Deleting::exit()
-{
-  context< PeeringMachine >().log_exit(state_name, enter_time);
-  PG *pg = context< PeeringMachine >().pg;
-  pg->deleting = false;
-  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
-}
-
-/*--------GetInfo---------*/
-PG::PeeringState::GetInfo::GetInfo(my_context ctx)
-  : my_base(ctx),
-    NamedState(context< PeeringMachine >().pg, "Started/Primary/Peering/GetInfo")
-{
-  context< PeeringMachine >().log_enter(state_name);
-
-  PG *pg = context< PeeringMachine >().pg;
-  pg->check_past_interval_bounds();
-  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
-
-  ceph_assert(pg->blocked_by.empty());
-
-  prior_set = pg->build_prior();
-
-  pg->reset_min_peer_features();
-  get_infos();
-  if (prior_set.pg_down) {
-    post_event(IsDown());
-  } else if (peer_info_requested.empty()) {
-    post_event(GotInfo());
-  }
-}
-
-void PG::PeeringState::GetInfo::get_infos()
-{
-  PG *pg = context< PeeringMachine >().pg;
-  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
-
-  pg->blocked_by.clear();
-  for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
-       it != prior_set.probe.end();
-       ++it) {
-    pg_shard_t peer = *it;
-    if (peer == pg->pg_whoami) {
-      continue;
-    }
-    if (pg->peer_info.count(peer)) {
-      ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
-      continue;
-    }
-    if (peer_info_requested.count(peer)) {
-      ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
-      pg->blocked_by.insert(peer.osd);
-    } else if (!pg->get_osdmap()->is_up(peer.osd)) {
-      ldout(pg->cct, 10) << " not querying info from down osd."
<< peer << dendl; - } else { - ldout(pg->cct, 10) << " querying info from osd." << peer << dendl; - context< PeeringMachine >().send_query( - peer, pg_query_t(pg_query_t::INFO, - it->shard, pg->pg_whoami.shard, - pg->info.history, - pg->get_osdmap_epoch())); - peer_info_requested.insert(peer); - pg->blocked_by.insert(peer.osd); - } - } - - pg->publish_stats_to_osd(); -} - -boost::statechart::result PG::PeeringState::GetInfo::react(const MNotifyRec& infoevt) -{ - PG *pg = context< PeeringMachine >().pg; - - set::iterator p = peer_info_requested.find(infoevt.from); - if (p != peer_info_requested.end()) { - peer_info_requested.erase(p); - pg->blocked_by.erase(infoevt.from.osd); - } - - epoch_t old_start = pg->info.history.last_epoch_started; - if (pg->proc_replica_info( - infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) { - // we got something new ... - PastIntervals::PriorSet &prior_set = context< Peering >().prior_set; - if (old_start < pg->info.history.last_epoch_started) { - ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl; - prior_set = pg->build_prior(); - - // filter out any osds that got dropped from the probe set from - // peer_info_requested. this is less expensive than restarting - // peering (which would re-probe everyone). - set::iterator p = peer_info_requested.begin(); - while (p != peer_info_requested.end()) { - if (prior_set.probe.count(*p) == 0) { - ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl; - peer_info_requested.erase(p++); - } else { - ++p; - } - } - get_infos(); - } - ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: " - << hex << infoevt.features << dec << dendl; - pg->apply_peer_features(infoevt.features); - - // are we done getting everything? 
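-  // ("everything" here means: no outstanding info requests, and no prior
-  // interval is blocked behind a down osd)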
- if (peer_info_requested.empty() && !prior_set.pg_down) { - ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl; - ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl; - ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl; - post_event(GotInfo()); - } - } - return discard_event(); -} - -boost::statechart::result PG::PeeringState::GetInfo::react(const QueryState& q) -{ - PG *pg = context< PeeringMachine >().pg; - q.f->open_object_section("state"); - q.f->dump_string("name", state_name); - q.f->dump_stream("enter_time") << enter_time; - - q.f->open_array_section("requested_info_from"); - for (set::iterator p = peer_info_requested.begin(); - p != peer_info_requested.end(); - ++p) { - q.f->open_object_section("osd"); - q.f->dump_stream("osd") << *p; - if (pg->peer_info.count(*p)) { - q.f->open_object_section("got_info"); - pg->peer_info[*p].dump(q.f); - q.f->close_section(); - } - q.f->close_section(); - } - q.f->close_section(); - - q.f->close_section(); - return forward_event(); -} - -void PG::PeeringState::GetInfo::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur); - pg->blocked_by.clear(); -} - -/*------GetLog------------*/ -PG::PeeringState::GetLog::GetLog(my_context ctx) - : my_base(ctx), - NamedState( - context< PeeringMachine >().pg, "Started/Primary/Peering/GetLog"), - msg(0) -{ - context< PeeringMachine >().log_enter(state_name); - - PG *pg = context< PeeringMachine >().pg; - - // adjust acting? - if (!pg->choose_acting(auth_log_shard, false, - &context< Peering >().history_les_bound)) { - if (!pg->want_acting.empty()) { - post_event(NeedActingChange()); - } else { - post_event(IsIncomplete()); - } - return; - } - - // am i the best? - if (auth_log_shard == pg->pg_whoami) { - post_event(GotLog()); - return; - } - - const pg_info_t& best = pg->peer_info[auth_log_shard]; - - // am i broken? - if (pg->info.last_update < best.log_tail) { - ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl; - post_event(IsIncomplete()); - return; - } - - // how much log to request? - eversion_t request_log_from = pg->info.last_update; - ceph_assert(!pg->acting_recovery_backfill.empty()); - for (set::iterator p = pg->acting_recovery_backfill.begin(); - p != pg->acting_recovery_backfill.end(); - ++p) { - if (*p == pg->pg_whoami) continue; - pg_info_t& ri = pg->peer_info[*p]; - if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail && - ri.last_update < request_log_from) - request_log_from = ri.last_update; - } - - // how much? - ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl; - context().send_query( - auth_log_shard, - pg_query_t( - pg_query_t::LOG, - auth_log_shard.shard, pg->pg_whoami.shard, - request_log_from, pg->info.history, - pg->get_osdmap_epoch())); - - ceph_assert(pg->blocked_by.empty()); - pg->blocked_by.insert(auth_log_shard.osd); - pg->publish_stats_to_osd(); -} - -boost::statechart::result PG::PeeringState::GetLog::react(const AdvMap& advmap) -{ - PG *pg = context< PeeringMachine >().pg; - // make sure our log source didn't go down. we need to check - // explicitly because it may not be part of the prior set, which - // means the Peering state check won't catch it going down. 
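-  // if it did go down, restart peering from scratch via Reset below.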
- if (!advmap.osdmap->is_up(auth_log_shard.osd)) { - ldout(pg->cct, 10) << "GetLog: auth_log_shard osd." - << auth_log_shard.osd << " went down" << dendl; - post_event(advmap); - return transit< Reset >(); - } - - // let the Peering state do its checks. - return forward_event(); -} - -boost::statechart::result PG::PeeringState::GetLog::react(const MLogRec& logevt) -{ - PG *pg = context< PeeringMachine >().pg; - ceph_assert(!msg); - if (logevt.from != auth_log_shard) { - ldout(pg->cct, 10) << "GetLog: discarding log from " - << "non-auth_log_shard osd." << logevt.from << dendl; - return discard_event(); - } - ldout(pg->cct, 10) << "GetLog: received master log from osd." - << logevt.from << dendl; - msg = logevt.msg; - post_event(GotLog()); - return discard_event(); -} - -boost::statechart::result PG::PeeringState::GetLog::react(const GotLog&) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "leaving GetLog" << dendl; - if (msg) { - ldout(pg->cct, 10) << "processing master log" << dendl; - pg->proc_master_log(*context().get_cur_transaction(), - msg->info, msg->log, msg->missing, - auth_log_shard); - } - pg->start_flush(context< PeeringMachine >().get_cur_transaction()); - return transit< GetMissing >(); -} - -boost::statechart::result PG::PeeringState::GetLog::react(const QueryState& q) -{ - q.f->open_object_section("state"); - q.f->dump_string("name", state_name); - q.f->dump_stream("enter_time") << enter_time; - q.f->dump_stream("auth_log_shard") << auth_log_shard; - q.f->close_section(); - return forward_event(); -} - -void PG::PeeringState::GetLog::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur); - pg->blocked_by.clear(); -} - -/*------WaitActingChange--------*/ -PG::PeeringState::WaitActingChange::WaitActingChange(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Started/Primary/WaitActingChange") -{ - context< PeeringMachine >().log_enter(state_name); -} - -boost::statechart::result PG::PeeringState::WaitActingChange::react(const AdvMap& advmap) -{ - PG *pg = context< PeeringMachine >().pg; - OSDMapRef osdmap = advmap.osdmap; - - ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets didn't go down" << dendl; - for (vector::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) { - if (!osdmap->is_up(*p)) { - ldout(pg->cct, 10) << " want_acting target osd." 
<< *p << " went down, resetting" << dendl; - post_event(advmap); - return transit< Reset >(); - } - } - return forward_event(); -} - -boost::statechart::result PG::PeeringState::WaitActingChange::react(const MLogRec& logevt) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLocRec" << dendl; - return discard_event(); -} - -boost::statechart::result PG::PeeringState::WaitActingChange::react(const MInfoRec& evt) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl; - return discard_event(); -} - -boost::statechart::result PG::PeeringState::WaitActingChange::react(const MNotifyRec& evt) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl; - return discard_event(); -} - -boost::statechart::result PG::PeeringState::WaitActingChange::react(const QueryState& q) -{ - q.f->open_object_section("state"); - q.f->dump_string("name", state_name); - q.f->dump_stream("enter_time") << enter_time; - q.f->dump_string("comment", "waiting for pg acting set to change"); - q.f->close_section(); - return forward_event(); -} - -void PG::PeeringState::WaitActingChange::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur); -} - -/*------Down--------*/ -PG::PeeringState::Down::Down(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Started/Primary/Peering/Down") -{ - context< PeeringMachine >().log_enter(state_name); - PG *pg = context< PeeringMachine >().pg; - - pg->state_clear(PG_STATE_PEERING); - pg->state_set(PG_STATE_DOWN); - - auto &prior_set = context< Peering >().prior_set; - ceph_assert(pg->blocked_by.empty()); - pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end()); - pg->publish_stats_to_osd(); -} - -void PG::PeeringState::Down::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - - pg->state_clear(PG_STATE_DOWN); - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_down_latency, dur); - - pg->blocked_by.clear(); -} - -boost::statechart::result PG::PeeringState::Down::react(const QueryState& q) -{ - q.f->open_object_section("state"); - q.f->dump_string("name", state_name); - q.f->dump_stream("enter_time") << enter_time; - q.f->dump_string("comment", - "not enough up instances of this PG to go active"); - q.f->close_section(); - return forward_event(); -} - -boost::statechart::result PG::PeeringState::Down::react(const MNotifyRec& infoevt) -{ - PG *pg = context< PeeringMachine >().pg; - - ceph_assert(pg->is_primary()); - epoch_t old_start = pg->info.history.last_epoch_started; - if (!pg->peer_info.count(infoevt.from) && - pg->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) { - pg->update_history(infoevt.notify.info.history); - } - // if we got something new to make pg escape down state - if (pg->info.history.last_epoch_started > old_start) { - ldout(pg->cct, 10) << " last_epoch_started moved forward, re-enter getinfo" << dendl; - pg->state_clear(PG_STATE_DOWN); - pg->state_set(PG_STATE_PEERING); - return transit< GetInfo >(); - } - - return discard_event(); -} - - -/*------Incomplete--------*/ -PG::PeeringState::Incomplete::Incomplete(my_context ctx) - : my_base(ctx), - 
NamedState(context< PeeringMachine >().pg, "Started/Primary/Peering/Incomplete") -{ - context< PeeringMachine >().log_enter(state_name); - PG *pg = context< PeeringMachine >().pg; - - pg->state_clear(PG_STATE_PEERING); - pg->state_set(PG_STATE_INCOMPLETE); - - PastIntervals::PriorSet &prior_set = context< Peering >().prior_set; - ceph_assert(pg->blocked_by.empty()); - pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end()); - pg->publish_stats_to_osd(); -} - -boost::statechart::result PG::PeeringState::Incomplete::react(const AdvMap &advmap) { - PG *pg = context< PeeringMachine >().pg; - int64_t poolnum = pg->info.pgid.pool(); - - // Reset if min_size turn smaller than previous value, pg might now be able to go active - if (!advmap.osdmap->have_pg_pool(poolnum) || - advmap.lastmap->get_pools().find(poolnum)->second.min_size > - advmap.osdmap->get_pools().find(poolnum)->second.min_size) { - post_event(advmap); - return transit< Reset >(); - } - - return forward_event(); -} - -boost::statechart::result PG::PeeringState::Incomplete::react(const MNotifyRec& notevt) { - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl; - if (pg->proc_replica_info( - notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) { - // We got something new, try again! - return transit< GetLog >(); - } else { - return discard_event(); - } -} - -boost::statechart::result PG::PeeringState::Incomplete::react( - const QueryState& q) -{ - q.f->open_object_section("state"); - q.f->dump_string("name", state_name); - q.f->dump_stream("enter_time") << enter_time; - q.f->dump_string("comment", "not enough complete instances of this PG"); - q.f->close_section(); - return forward_event(); -} - -void PG::PeeringState::Incomplete::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - - pg->state_clear(PG_STATE_INCOMPLETE); - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur); - - pg->blocked_by.clear(); -} - -/*------GetMissing--------*/ -PG::PeeringState::GetMissing::GetMissing(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Started/Primary/Peering/GetMissing") -{ - context< PeeringMachine >().log_enter(state_name); - - PG *pg = context< PeeringMachine >().pg; - ceph_assert(!pg->acting_recovery_backfill.empty()); - eversion_t since; - for (set::iterator i = pg->acting_recovery_backfill.begin(); - i != pg->acting_recovery_backfill.end(); - ++i) { - if (*i == pg->get_primary()) continue; - const pg_info_t& pi = pg->peer_info[*i]; - // reset this so to make sure the pg_missing_t is initialized and - // has the correct semantics even if we don't need to get a - // missing set from a shard. This way later additions due to - // lost+unfound delete work properly. - pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering(); - - if (pi.is_empty()) - continue; // no pg data, nothing divergent - - if (pi.last_update < pg->pg_log.get_tail()) { - ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl; - pg->peer_missing[*i].clear(); - continue; - } - if (pi.last_backfill == hobject_t()) { - ldout(pg->cct, 10) << " osd." 
<< *i << " will fully backfill; can infer empty missing set" << dendl; - pg->peer_missing[*i].clear(); - continue; - } - - if (pi.last_update == pi.last_complete && // peer has no missing - pi.last_update == pg->info.last_update) { // peer is up to date - // replica has no missing and identical log as us. no need to - // pull anything. - // FIXME: we can do better here. if last_update==last_complete we - // can infer the rest! - ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl; - pg->peer_missing[*i].clear(); - continue; - } - - // We pull the log from the peer's last_epoch_started to ensure we - // get enough log to detect divergent updates. - since.epoch = pi.last_epoch_started; - ceph_assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing - if (pi.log_tail <= since) { - ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl; - context< PeeringMachine >().send_query( - *i, - pg_query_t( - pg_query_t::LOG, - i->shard, pg->pg_whoami.shard, - since, pg->info.history, - pg->get_osdmap_epoch())); - } else { - ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i - << " (want since " << since << " < log.tail " - << pi.log_tail << ")" << dendl; - context< PeeringMachine >().send_query( - *i, pg_query_t( - pg_query_t::FULLLOG, - i->shard, pg->pg_whoami.shard, - pg->info.history, pg->get_osdmap_epoch())); - } - peer_missing_requested.insert(*i); - pg->blocked_by.insert(i->osd); - } - - if (peer_missing_requested.empty()) { - if (pg->need_up_thru) { - ldout(pg->cct, 10) << " still need up_thru update before going active" - << dendl; - post_event(NeedUpThru()); - return; - } - - // all good! - post_event(Activate(pg->get_osdmap_epoch())); - } else { - pg->publish_stats_to_osd(); - } -} - -boost::statechart::result PG::PeeringState::GetMissing::react(const MLogRec& logevt) -{ - PG *pg = context< PeeringMachine >().pg; - - peer_missing_requested.erase(logevt.from); - pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from); - - if (peer_missing_requested.empty()) { - if (pg->need_up_thru) { - ldout(pg->cct, 10) << " still need up_thru update before going active" - << dendl; - post_event(NeedUpThru()); - } else { - ldout(pg->cct, 10) << "Got last missing, don't need missing " - << "posting Activate" << dendl; - post_event(Activate(pg->get_osdmap_epoch())); - } - } - return discard_event(); -} - -boost::statechart::result PG::PeeringState::GetMissing::react(const QueryState& q) -{ - PG *pg = context< PeeringMachine >().pg; - q.f->open_object_section("state"); - q.f->dump_string("name", state_name); - q.f->dump_stream("enter_time") << enter_time; - - q.f->open_array_section("peer_missing_requested"); - for (set::iterator p = peer_missing_requested.begin(); - p != peer_missing_requested.end(); - ++p) { - q.f->open_object_section("osd"); - q.f->dump_stream("osd") << *p; - if (pg->peer_missing.count(*p)) { - q.f->open_object_section("got_missing"); - pg->peer_missing[*p].dump(q.f); - q.f->close_section(); - } - q.f->close_section(); - } - q.f->close_section(); - - q.f->close_section(); - return forward_event(); -} - -void PG::PeeringState::GetMissing::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur); - pg->blocked_by.clear(); -} - -/*------WaitUpThru--------*/ 
-PG::PeeringState::WaitUpThru::WaitUpThru(my_context ctx) - : my_base(ctx), - NamedState(context< PeeringMachine >().pg, "Started/Primary/Peering/WaitUpThru") -{ - context< PeeringMachine >().log_enter(state_name); -} - -boost::statechart::result PG::PeeringState::WaitUpThru::react(const ActMap& am) -{ - PG *pg = context< PeeringMachine >().pg; - if (!pg->need_up_thru) { - post_event(Activate(pg->get_osdmap_epoch())); - } - return forward_event(); -} - -boost::statechart::result PG::PeeringState::WaitUpThru::react(const MLogRec& logevt) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl; - pg->peer_missing[logevt.from].claim(logevt.msg->missing); - pg->peer_info[logevt.from] = logevt.msg->info; - return discard_event(); -} - -boost::statechart::result PG::PeeringState::WaitUpThru::react(const QueryState& q) -{ - q.f->open_object_section("state"); - q.f->dump_string("name", state_name); - q.f->dump_stream("enter_time") << enter_time; - q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd"); - q.f->close_section(); - return forward_event(); -} - -void PG::PeeringState::WaitUpThru::exit() -{ - context< PeeringMachine >().log_exit(state_name, enter_time); - PG *pg = context< PeeringMachine >().pg; - utime_t dur = ceph_clock_now() - enter_time; - pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur); -} - -/*----PeeringState::PeeringMachine Methods-----*/ -#undef dout_prefix -#define dout_prefix pg->gen_prefix(*_dout) - -void PG::PeeringState::PeeringMachine::log_enter(const char *state_name) -{ - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 5) << "enter " << state_name << dendl; - pg->osd->pg_recovery_stats.log_enter(state_name); -} - -void PG::PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time) -{ - utime_t dur = ceph_clock_now() - enter_time; - PG *pg = context< PeeringMachine >().pg; - ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl; - pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time, - event_count, event_time); - event_count = 0; - event_time = utime_t(); -} - - -/*---------------------------------------------------*/ -#undef dout_prefix -#define dout_prefix ((debug_pg ? 
debug_pg->gen_prefix(*_dout) : *_dout) << " PriorSet: ") - -void PG::PeeringState::start_handle(PeeringCtx *new_ctx) { - ceph_assert(!rctx); - ceph_assert(!orig_ctx); - orig_ctx = new_ctx; - if (new_ctx) { - if (messages_pending_flush) { - rctx = PeeringCtx(*messages_pending_flush, *new_ctx); - } else { - rctx = *new_ctx; - } - rctx->start_time = ceph_clock_now(); - } -} - -void PG::PeeringState::begin_block_outgoing() { - ceph_assert(!messages_pending_flush); - ceph_assert(orig_ctx); - ceph_assert(rctx); - messages_pending_flush = BufferedRecoveryMessages(); - rctx = PeeringCtx(*messages_pending_flush, *orig_ctx); -} - -void PG::PeeringState::clear_blocked_outgoing() { - ceph_assert(orig_ctx); - ceph_assert(rctx); - messages_pending_flush = boost::optional(); -} - -void PG::PeeringState::end_block_outgoing() { - ceph_assert(messages_pending_flush); - ceph_assert(orig_ctx); - ceph_assert(rctx); - - rctx = PeeringCtx(*orig_ctx); - rctx->accept_buffered_messages(*messages_pending_flush); - messages_pending_flush = boost::optional(); -} - -void PG::PeeringState::end_handle() { - if (rctx) { - utime_t dur = ceph_clock_now() - rctx->start_time; - machine.event_time += dur; - } - - machine.event_count++; - rctx = boost::optional(); - orig_ctx = NULL; -} - ostream& operator<<(ostream& out, const PG::BackfillInterval& bi) { out << "BackfillInfo(" << bi.begin << "-" << bi.end diff --git a/src/osd/PG.h b/src/osd/PG.h index 1dbe23899c3..8d94f867a7a 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -44,6 +44,7 @@ #include "include/str_list.h" #include "PGBackend.h" #include "PGPeeringEvent.h" +#include "PeeringState.h" #include "mgr/OSDPerfMetricTypes.h" @@ -256,15 +257,16 @@ struct PGPool { */ class PG : public DoutPrefixProvider { + friend class PeeringState; public: + using PeeringCtx = PeeringState::PeeringCtx; + // -- members -- const spg_t pg_id; const coll_t coll; ObjectStore::CollectionHandle ch; - struct PeeringCtx; - // -- methods -- std::ostream& gen_prefix(std::ostream& out) const override; CephContext *get_cct() const override { @@ -493,6 +495,16 @@ public: virtual void on_removal(ObjectStore::Transaction *t) = 0; + struct C_DeleteMore : public Context { + PGRef pg; + epoch_t epoch; + C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {} + void finish(int r) override { + ceph_abort(); + } + void complete(int r) override; + }; + void _delete_some(ObjectStore::Transaction *t); virtual void set_dynamic_perf_stats_queries( @@ -985,101 +997,12 @@ protected: set blocked_by; ///< osds we are blocked by (for pg stats) -protected: - // [primary only] content recovery state - struct BufferedRecoveryMessages { - map > query_map; - map > > info_map; - map > > notify_list; - }; - public: bool dne() { return info.dne(); } - struct PeeringCtx { - utime_t start_time; - map > *query_map; - map > > *info_map; - map > > *notify_list; - ObjectStore::Transaction *transaction; - ThreadPool::TPHandle* handle; - PeeringCtx(map > *query_map, - map > > *info_map, - map > > *notify_list, - ObjectStore::Transaction *transaction) - : query_map(query_map), info_map(info_map), - notify_list(notify_list), - transaction(transaction), - handle(NULL) {} - - PeeringCtx(BufferedRecoveryMessages &buf, PeeringCtx &rctx) - : query_map(&(buf.query_map)), - info_map(&(buf.info_map)), - notify_list(&(buf.notify_list)), - transaction(rctx.transaction), - handle(rctx.handle) {} - - void accept_buffered_messages(BufferedRecoveryMessages &m) { - ceph_assert(query_map); - ceph_assert(info_map); - ceph_assert(notify_list); - for (map 
>::iterator i = m.query_map.begin(); - i != m.query_map.end(); - ++i) { - map &omap = (*query_map)[i->first]; - for (map::iterator j = i->second.begin(); - j != i->second.end(); - ++j) { - omap[j->first] = j->second; - } - } - for (map > >::iterator i - = m.info_map.begin(); - i != m.info_map.end(); - ++i) { - vector > &ovec = - (*info_map)[i->first]; - ovec.reserve(ovec.size() + i->second.size()); - ovec.insert(ovec.end(), i->second.begin(), i->second.end()); - } - for (map > >::iterator i - = m.notify_list.begin(); - i != m.notify_list.end(); - ++i) { - vector > &ovec = - (*notify_list)[i->first]; - ovec.reserve(ovec.size() + i->second.size()); - ovec.insert(ovec.end(), i->second.begin(), i->second.end()); - } - } - void send_notify(pg_shard_t to, - const pg_notify_t &info, const PastIntervals &pi) { - ceph_assert(notify_list); - (*notify_list)[to.osd].emplace_back(info, pi); - } - }; protected: - PGStateHistory pgstate_history; - struct NamedState { - const char *state_name; - utime_t enter_time; - PG* pg; - const char *get_state_name() { return state_name; } - NamedState(PG *pg_, const char *state_name_) - : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) { - pg->pgstate_history.enter(pg, enter_time, state_name); - } - virtual ~NamedState() { pg->pgstate_history.exit(state_name); } - }; - - - -protected: - /* * peer_info -- projected (updates _before_ replicas ack) * peer_missing -- committed (updates _after_ replicas ack) @@ -1960,915 +1883,11 @@ protected: }; - struct QueryState : boost::statechart::event< QueryState > { - Formatter *f; - explicit QueryState(Formatter *f) : f(f) {} - void print(std::ostream *out) const { - *out << "Query"; - } - }; - public: int pg_stat_adjust(osd_stat_t *new_stat); protected: - struct AdvMap : boost::statechart::event< AdvMap > { - OSDMapRef osdmap; - OSDMapRef lastmap; - vector newup, newacting; - int up_primary, acting_primary; - AdvMap( - OSDMapRef osdmap, OSDMapRef lastmap, - vector& newup, int up_primary, - vector& newacting, int acting_primary): - osdmap(osdmap), lastmap(lastmap), - newup(newup), - newacting(newacting), - up_primary(up_primary), - acting_primary(acting_primary) {} - void print(std::ostream *out) const { - *out << "AdvMap"; - } - }; - - struct ActMap : boost::statechart::event< ActMap > { - ActMap() : boost::statechart::event< ActMap >() {} - void print(std::ostream *out) const { - *out << "ActMap"; - } - }; - struct Activate : boost::statechart::event< Activate > { - epoch_t activation_epoch; - explicit Activate(epoch_t q) : boost::statechart::event< Activate >(), - activation_epoch(q) {} - void print(std::ostream *out) const { - *out << "Activate from " << activation_epoch; - } - }; -public: - struct UnfoundBackfill : boost::statechart::event { - explicit UnfoundBackfill() {} - void print(std::ostream *out) const { - *out << "UnfoundBackfill"; - } - }; - struct UnfoundRecovery : boost::statechart::event { - explicit UnfoundRecovery() {} - void print(std::ostream *out) const { - *out << "UnfoundRecovery"; - } - }; - - struct RequestScrub : boost::statechart::event { - bool deep; - bool repair; - explicit RequestScrub(bool d, bool r) : deep(d), repair(r) {} - void print(std::ostream *out) const { - *out << "RequestScrub(" << (deep ? "deep" : "shallow") - << (repair ? 
" repair" : ""); - } - }; - -protected: - TrivialEvent(Initialize) - TrivialEvent(GotInfo) - TrivialEvent(NeedUpThru) - TrivialEvent(Backfilled) - TrivialEvent(LocalBackfillReserved) - TrivialEvent(RejectRemoteReservation) - public: - TrivialEvent(RequestBackfill) - protected: - TrivialEvent(RemoteRecoveryPreempted) - TrivialEvent(RemoteBackfillPreempted) - TrivialEvent(BackfillTooFull) - TrivialEvent(RecoveryTooFull) - - TrivialEvent(MakePrimary) - TrivialEvent(MakeStray) - TrivialEvent(NeedActingChange) - TrivialEvent(IsIncomplete) - TrivialEvent(IsDown) - - TrivialEvent(AllReplicasRecovered) - TrivialEvent(DoRecovery) - TrivialEvent(LocalRecoveryReserved) - public: - protected: - TrivialEvent(AllRemotesReserved) - TrivialEvent(AllBackfillsReserved) - TrivialEvent(GoClean) - - TrivialEvent(AllReplicasActivated) - - TrivialEvent(IntervalFlush) - - public: - TrivialEvent(DeleteStart) - TrivialEvent(DeleteSome) - - TrivialEvent(SetForceRecovery) - TrivialEvent(UnsetForceRecovery) - TrivialEvent(SetForceBackfill) - TrivialEvent(UnsetForceBackfill) - - protected: - TrivialEvent(DeleteReserved) - TrivialEvent(DeleteInterrupted) - - /* Encapsulates PG recovery process */ - class PeeringState { - void start_handle(RecoveryCtx *new_ctx); - void end_handle(); - public: - void begin_block_outgoing(); - void end_block_outgoing(); - void clear_blocked_outgoing(); - private: - - /* States */ - struct Initial; - class PeeringMachine : public boost::statechart::state_machine< PeeringMachine, Initial > { - PeeringState *state; - public: - PG *pg; - - utime_t event_time; - uint64_t event_count; - - void clear_event_counters() { - event_time = utime_t(); - event_count = 0; - } - - void log_enter(const char *state_name); - void log_exit(const char *state_name, utime_t duration); - - PeeringMachine(PeeringState *state, PG *pg) : state(state), pg(pg), event_count(0) {} - - /* Accessor functions for state methods */ - ObjectStore::Transaction* get_cur_transaction() { - ceph_assert(state->rctx); - ceph_assert(state->rctx->transaction); - return state->rctx->transaction; - } - - void send_query(pg_shard_t to, const pg_query_t &query) { - ceph_assert(state->rctx); - ceph_assert(state->rctx->query_map); - (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] = - query; - } - - map > *get_query_map() { - ceph_assert(state->rctx); - ceph_assert(state->rctx->query_map); - return state->rctx->query_map; - } - - map > > *get_info_map() { - ceph_assert(state->rctx); - ceph_assert(state->rctx->info_map); - return state->rctx->info_map; - } - - RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); } - - void send_notify(pg_shard_t to, - const pg_notify_t &info, const PastIntervals &pi) { - ceph_assert(state->rctx); - state->rctx->send_notify(to, info, pi); - } - }; - friend class PeeringMachine; - - /* States */ - // Initial - // Reset - // Start - // Started - // Primary - // WaitActingChange - // Peering - // GetInfo - // GetLog - // GetMissing - // WaitUpThru - // Incomplete - // Active - // Activating - // Clean - // Recovered - // Backfilling - // WaitRemoteBackfillReserved - // WaitLocalBackfillReserved - // NotBackfilling - // NotRecovering - // Recovering - // WaitRemoteRecoveryReserved - // WaitLocalRecoveryReserved - // ReplicaActive - // RepNotRecovering - // RepRecovering - // RepWaitBackfillReserved - // RepWaitRecoveryReserved - // Stray - // ToDelete - // WaitDeleteReserved - // Deleting - // Crashed - - struct Crashed : boost::statechart::state< Crashed, PeeringMachine >, NamedState { 
- explicit Crashed(my_context ctx); - }; - - struct Reset; - - struct Initial : boost::statechart::state< Initial, PeeringMachine >, NamedState { - explicit Initial(my_context ctx); - void exit(); - - typedef boost::mpl::list < - boost::statechart::transition< Initialize, Reset >, - boost::statechart::custom_reaction< NullEvt >, - boost::statechart::transition< boost::statechart::event_base, Crashed > - > reactions; - - boost::statechart::result react(const MNotifyRec&); - boost::statechart::result react(const MInfoRec&); - boost::statechart::result react(const MLogRec&); - boost::statechart::result react(const boost::statechart::event_base&) { - return discard_event(); - } - }; - - struct Reset : boost::statechart::state< Reset, PeeringMachine >, NamedState { - explicit Reset(my_context ctx); - void exit(); - - typedef boost::mpl::list < - boost::statechart::custom_reaction< QueryState >, - boost::statechart::custom_reaction< AdvMap >, - boost::statechart::custom_reaction< ActMap >, - boost::statechart::custom_reaction< NullEvt >, - boost::statechart::custom_reaction< IntervalFlush >, - boost::statechart::transition< boost::statechart::event_base, Crashed > - > reactions; - boost::statechart::result react(const QueryState& q); - boost::statechart::result react(const AdvMap&); - boost::statechart::result react(const ActMap&); - boost::statechart::result react(const IntervalFlush&); - boost::statechart::result react(const boost::statechart::event_base&) { - return discard_event(); - } - }; - - struct Start; - - struct Started : boost::statechart::state< Started, PeeringMachine, Start >, NamedState { - explicit Started(my_context ctx); - void exit(); - - typedef boost::mpl::list < - boost::statechart::custom_reaction< QueryState >, - boost::statechart::custom_reaction< AdvMap >, - boost::statechart::custom_reaction< IntervalFlush >, - // ignored - boost::statechart::custom_reaction< NullEvt >, - boost::statechart::custom_reaction, - boost::statechart::custom_reaction, - boost::statechart::custom_reaction, - boost::statechart::custom_reaction, - boost::statechart::custom_reaction, - // crash - boost::statechart::transition< boost::statechart::event_base, Crashed > - > reactions; - boost::statechart::result react(const QueryState& q); - boost::statechart::result react(const AdvMap&); - boost::statechart::result react(const IntervalFlush&); - boost::statechart::result react(const boost::statechart::event_base&) { - return discard_event(); - } - }; - - struct Primary; - struct Stray; - - struct Start : boost::statechart::state< Start, Started >, NamedState { - explicit Start(my_context ctx); - void exit(); - - typedef boost::mpl::list < - boost::statechart::transition< MakePrimary, Primary >, - boost::statechart::transition< MakeStray, Stray > - > reactions; - }; - - struct Peering; - struct WaitActingChange; - struct Incomplete; - struct Down; - - struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState { - explicit Primary(my_context ctx); - void exit(); - - typedef boost::mpl::list < - boost::statechart::custom_reaction< ActMap >, - boost::statechart::custom_reaction< MNotifyRec >, - boost::statechart::transition< NeedActingChange, WaitActingChange >, - boost::statechart::custom_reaction, - boost::statechart::custom_reaction, - boost::statechart::custom_reaction, - boost::statechart::custom_reaction, - boost::statechart::custom_reaction - > reactions; - boost::statechart::result react(const ActMap&); - boost::statechart::result react(const MNotifyRec&); - 
boost::statechart::result react(const SetForceRecovery&); - boost::statechart::result react(const UnsetForceRecovery&); - boost::statechart::result react(const SetForceBackfill&); - boost::statechart::result react(const UnsetForceBackfill&); - boost::statechart::result react(const RequestScrub&); - }; - - struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>, - NamedState { - typedef boost::mpl::list < - boost::statechart::custom_reaction< QueryState >, - boost::statechart::custom_reaction< AdvMap >, - boost::statechart::custom_reaction< MLogRec >, - boost::statechart::custom_reaction< MInfoRec >, - boost::statechart::custom_reaction< MNotifyRec > - > reactions; - explicit WaitActingChange(my_context ctx); - boost::statechart::result react(const QueryState& q); - boost::statechart::result react(const AdvMap&); - boost::statechart::result react(const MLogRec&); - boost::statechart::result react(const MInfoRec&); - boost::statechart::result react(const MNotifyRec&); - void exit(); - }; - - struct GetInfo; - struct Active; - - struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState { - PastIntervals::PriorSet prior_set; - bool history_les_bound; //< need osd_find_best_info_ignore_history_les - - explicit Peering(my_context ctx); - void exit(); - - typedef boost::mpl::list < - boost::statechart::custom_reaction< QueryState >, - boost::statechart::transition< Activate, Active >, - boost::statechart::custom_reaction< AdvMap > - > reactions; - boost::statechart::result react(const QueryState& q); - boost::statechart::result react(const AdvMap &advmap); - }; - - struct WaitLocalRecoveryReserved; - struct Activating; - struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState { - explicit Active(my_context ctx); - void exit(); - - const set<pg_shard_t> remote_shards_to_reserve_recovery; - const set<pg_shard_t> remote_shards_to_reserve_backfill; - bool all_replicas_activated; - - typedef boost::mpl::list < - boost::statechart::custom_reaction< QueryState >, - boost::statechart::custom_reaction< ActMap >, - boost::statechart::custom_reaction< AdvMap >, - boost::statechart::custom_reaction< MInfoRec >, - boost::statechart::custom_reaction< MNotifyRec >, - boost::statechart::custom_reaction< MLogRec >, - boost::statechart::custom_reaction< MTrim >, - boost::statechart::custom_reaction< Backfilled >, - boost::statechart::custom_reaction< AllReplicasActivated >, - boost::statechart::custom_reaction< DeferRecovery >, - boost::statechart::custom_reaction< DeferBackfill >, - boost::statechart::custom_reaction< UnfoundRecovery >, - boost::statechart::custom_reaction< UnfoundBackfill >, - boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>, - boost::statechart::custom_reaction< RemoteReservationRevoked>, - boost::statechart::custom_reaction< DoRecovery> - > reactions; - boost::statechart::result react(const QueryState& q); - boost::statechart::result react(const ActMap&); - boost::statechart::result react(const AdvMap&); - boost::statechart::result react(const MInfoRec& infoevt); - boost::statechart::result react(const MNotifyRec& notevt); - boost::statechart::result react(const MLogRec& logevt); - boost::statechart::result react(const MTrim& trimevt); - boost::statechart::result react(const Backfilled&) { - return discard_event(); - } - boost::statechart::result react(const AllReplicasActivated&); - boost::statechart::result react(const DeferRecovery& evt) { - return discard_event(); - } - boost::statechart::result react(const DeferBackfill& evt) { - return discard_event(); - } - boost::statechart::result react(const UnfoundRecovery& evt) { - return discard_event(); - } - boost::statechart::result react(const UnfoundBackfill& evt) { - return discard_event(); - } - boost::statechart::result react(const RemoteReservationRevokedTooFull&) { - return discard_event(); - } - boost::statechart::result react(const RemoteReservationRevoked&) { - return discard_event(); - } - boost::statechart::result react(const DoRecovery&) { - return discard_event(); - } - }; - - struct Clean : boost::statechart::state< Clean, Active >, NamedState { - typedef boost::mpl::list< - boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, - boost::statechart::custom_reaction<SetForceRecovery>, - boost::statechart::custom_reaction<SetForceBackfill> - > reactions; - explicit Clean(my_context ctx); - void exit(); - boost::statechart::result react(const boost::statechart::event_base&) { - return discard_event(); - } - }; - - struct Recovered : boost::statechart::state< Recovered, Active >, NamedState { - typedef boost::mpl::list< - boost::statechart::transition< GoClean, Clean >, - boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, - boost::statechart::custom_reaction< AllReplicasActivated > - > reactions; - explicit Recovered(my_context ctx); - void exit(); - boost::statechart::result react(const AllReplicasActivated&) { - post_event(GoClean()); - return forward_event(); - } - }; - - struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState { - typedef boost::mpl::list< - boost::statechart::custom_reaction< Backfilled >, - boost::statechart::custom_reaction< DeferBackfill >, - boost::statechart::custom_reaction< UnfoundBackfill >, - boost::statechart::custom_reaction< RemoteReservationRejected >, - boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>, - boost::statechart::custom_reaction< RemoteReservationRevoked> - > reactions; - explicit Backfilling(my_context ctx); - boost::statechart::result react(const RemoteReservationRejected& evt) { - // for compat with old peers - post_event(RemoteReservationRevokedTooFull()); - return discard_event(); - } - void backfill_release_reservations(); - boost::statechart::result react(const Backfilled& evt); - boost::statechart::result react(const RemoteReservationRevokedTooFull& evt); - boost::statechart::result react(const RemoteReservationRevoked& evt); - boost::statechart::result react(const DeferBackfill& evt); - boost::statechart::result react(const UnfoundBackfill& evt); - void cancel_backfill(); - void exit(); - }; - - struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState { - typedef boost::mpl::list< - boost::statechart::custom_reaction< RemoteBackfillReserved >, - boost::statechart::custom_reaction< RemoteReservationRejected >, - boost::statechart::custom_reaction< RemoteReservationRevoked >, - boost::statechart::transition< AllBackfillsReserved, Backfilling > - > reactions; - set<pg_shard_t>::const_iterator backfill_osd_it; - explicit WaitRemoteBackfillReserved(my_context ctx); - void retry(); - void exit(); - boost::statechart::result react(const RemoteBackfillReserved& evt); - boost::statechart::result react(const RemoteReservationRejected& evt); - boost::statechart::result react(const RemoteReservationRevoked& evt); - }; - - struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState { - typedef boost::mpl::list< - boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved > - > reactions; - explicit WaitLocalBackfillReserved(my_context ctx); - void exit(); - }; - - struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState { - typedef boost::mpl::list< - boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>, - boost::statechart::custom_reaction< RemoteBackfillReserved >, - boost::statechart::custom_reaction< RemoteReservationRejected > - > reactions; - explicit NotBackfilling(my_context ctx); - void exit(); - boost::statechart::result react(const RemoteBackfillReserved& evt); - boost::statechart::result react(const RemoteReservationRejected& evt); - }; - - struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState { - typedef boost::mpl::list< - boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, - boost::statechart::custom_reaction< DeferRecovery >, - boost::statechart::custom_reaction< UnfoundRecovery > - > reactions; - explicit NotRecovering(my_context ctx); - boost::statechart::result react(const DeferRecovery& evt) { - /* no-op */ - return discard_event(); - } - boost::statechart::result react(const UnfoundRecovery& evt) { - /* no-op */ - return discard_event(); - } - void exit(); - }; - - struct ToDelete; - struct RepNotRecovering; - struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState { - explicit ReplicaActive(my_context ctx); - void exit(); - - typedef boost::mpl::list < - boost::statechart::custom_reaction< QueryState >, - boost::statechart::custom_reaction< ActMap >, - boost::statechart::custom_reaction< MQuery >, - boost::statechart::custom_reaction< MInfoRec >, - boost::statechart::custom_reaction< MLogRec >, - boost::statechart::custom_reaction< MTrim >, - boost::statechart::custom_reaction< Activate >, - boost::statechart::custom_reaction< DeferRecovery >, - boost::statechart::custom_reaction< DeferBackfill >, - boost::statechart::custom_reaction< UnfoundRecovery >, - boost::statechart::custom_reaction< UnfoundBackfill >, - boost::statechart::custom_reaction< RemoteBackfillPreempted >, - boost::statechart::custom_reaction< RemoteRecoveryPreempted >, - boost::statechart::custom_reaction< RecoveryDone >, - boost::statechart::transition<DeleteStart, ToDelete> - > reactions; - boost::statechart::result react(const QueryState& q); - boost::statechart::result react(const MInfoRec& infoevt); - boost::statechart::result react(const MLogRec& logevt); - boost::statechart::result react(const MTrim& trimevt); - boost::statechart::result react(const ActMap&); - boost::statechart::result react(const MQuery&); - boost::statechart::result react(const Activate&); - boost::statechart::result react(const RecoveryDone&) { - return discard_event(); - } - boost::statechart::result react(const DeferRecovery& evt) { - return discard_event(); - } - boost::statechart::result react(const DeferBackfill& evt) { - return discard_event(); - } - boost::statechart::result react(const UnfoundRecovery& evt) { - return discard_event(); - } - boost::statechart::result react(const UnfoundBackfill& evt) { - return discard_event(); - } - boost::statechart::result react(const RemoteBackfillPreempted& evt) { - return discard_event(); - } - boost::statechart::result react(const RemoteRecoveryPreempted& evt) { - return discard_event(); - } - }; - - struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState { - typedef boost::mpl::list< - boost::statechart::transition< RecoveryDone,
RepNotRecovering >, - // for compat with old peers - boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >, - boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >, - boost::statechart::custom_reaction< BackfillTooFull >, - boost::statechart::custom_reaction< RemoteRecoveryPreempted >, - boost::statechart::custom_reaction< RemoteBackfillPreempted > - > reactions; - explicit RepRecovering(my_context ctx); - boost::statechart::result react(const RemoteRecoveryPreempted &evt); - boost::statechart::result react(const BackfillTooFull &evt); - boost::statechart::result react(const RemoteBackfillPreempted &evt); - void exit(); - }; - - struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState { - typedef boost::mpl::list< - boost::statechart::custom_reaction< RemoteBackfillReserved >, - boost::statechart::custom_reaction< RejectRemoteReservation >, - boost::statechart::custom_reaction< RemoteReservationRejected >, - boost::statechart::custom_reaction< RemoteReservationCanceled > - > reactions; - explicit RepWaitBackfillReserved(my_context ctx); - void exit(); - boost::statechart::result react(const RemoteBackfillReserved &evt); - boost::statechart::result react(const RejectRemoteReservation &evt); - boost::statechart::result react(const RemoteReservationRejected &evt); - boost::statechart::result react(const RemoteReservationCanceled &evt); - }; - - struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState { - typedef boost::mpl::list< - boost::statechart::custom_reaction< RemoteRecoveryReserved >, - // for compat with old peers - boost::statechart::custom_reaction< RemoteReservationRejected >, - boost::statechart::custom_reaction< RemoteReservationCanceled > - > reactions; - explicit RepWaitRecoveryReserved(my_context ctx); - void exit(); - boost::statechart::result react(const RemoteRecoveryReserved &evt); - boost::statechart::result react(const RemoteReservationRejected &evt) { - // for compat with old peers - post_event(RemoteReservationCanceled()); - return discard_event(); - } - boost::statechart::result react(const RemoteReservationCanceled &evt); - }; - - struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState { - typedef boost::mpl::list< - boost::statechart::custom_reaction< RequestRecoveryPrio >, - boost::statechart::custom_reaction< RequestBackfillPrio >, - boost::statechart::custom_reaction< RejectRemoteReservation >, - boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >, - boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >, - boost::statechart::custom_reaction< RemoteRecoveryReserved >, - boost::statechart::custom_reaction< RemoteBackfillReserved >, - boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers - > reactions; - explicit RepNotRecovering(my_context ctx); - boost::statechart::result react(const RequestRecoveryPrio &evt); - boost::statechart::result react(const RequestBackfillPrio &evt); - boost::statechart::result react(const RemoteBackfillReserved &evt) { - // my reservation completion raced with a RELEASE from primary - return discard_event(); - } - boost::statechart::result react(const RemoteRecoveryReserved &evt) { - // my reservation completion raced with a RELEASE from primary - return discard_event(); - } - boost::statechart::result react(const RejectRemoteReservation &evt); - 
void exit(); - }; - - struct Recovering : boost::statechart::state< Recovering, Active >, NamedState { - typedef boost::mpl::list < - boost::statechart::custom_reaction< AllReplicasRecovered >, - boost::statechart::custom_reaction< DeferRecovery >, - boost::statechart::custom_reaction< UnfoundRecovery >, - boost::statechart::custom_reaction< RequestBackfill > - > reactions; - explicit Recovering(my_context ctx); - void exit(); - void release_reservations(bool cancel = false); - boost::statechart::result react(const AllReplicasRecovered &evt); - boost::statechart::result react(const DeferRecovery& evt); - boost::statechart::result react(const UnfoundRecovery& evt); - boost::statechart::result react(const RequestBackfill &evt); - }; - - struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState { - typedef boost::mpl::list < - boost::statechart::custom_reaction< RemoteRecoveryReserved >, - boost::statechart::transition< AllRemotesReserved, Recovering > - > reactions; - set<pg_shard_t>::const_iterator remote_recovery_reservation_it; - explicit WaitRemoteRecoveryReserved(my_context ctx); - boost::statechart::result react(const RemoteRecoveryReserved &evt); - void exit(); - }; - - struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState { - typedef boost::mpl::list < - boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >, - boost::statechart::custom_reaction< RecoveryTooFull > - > reactions; - explicit WaitLocalRecoveryReserved(my_context ctx); - void exit(); - boost::statechart::result react(const RecoveryTooFull &evt); - }; - - struct Activating : boost::statechart::state< Activating, Active >, NamedState { - typedef boost::mpl::list < - boost::statechart::transition< AllReplicasRecovered, Recovered >, - boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, - boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved > - > reactions; - explicit Activating(my_context ctx); - void exit(); - }; - - struct Stray : boost::statechart::state< Stray, Started >, - NamedState { - explicit Stray(my_context ctx); - void exit(); - - typedef boost::mpl::list < - boost::statechart::custom_reaction< MQuery >, - boost::statechart::custom_reaction< MLogRec >, - boost::statechart::custom_reaction< MInfoRec >, - boost::statechart::custom_reaction< ActMap >, - boost::statechart::custom_reaction< RecoveryDone >, - boost::statechart::transition<DeleteStart, ToDelete> - > reactions; - boost::statechart::result react(const MQuery& query); - boost::statechart::result react(const MLogRec& logevt); - boost::statechart::result react(const MInfoRec& infoevt); - boost::statechart::result react(const ActMap&); - boost::statechart::result react(const RecoveryDone&) { - return discard_event(); - } - }; - - struct WaitDeleteReserved; - struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState { - unsigned priority = 0; - typedef boost::mpl::list < - boost::statechart::custom_reaction< ActMap >, - boost::statechart::custom_reaction< DeleteSome > - > reactions; - explicit ToDelete(my_context ctx); - boost::statechart::result react(const ActMap &evt); - boost::statechart::result react(const DeleteSome &evt) { - // happens if we drop out of Deleting due to reprioritization etc. - return discard_event(); - } - void exit(); - }; - - struct Deleting; - struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved, ToDelete>, NamedState { - typedef boost::mpl::list < - boost::statechart::transition<DeleteReserved, Deleting> - > reactions; - explicit WaitDeleteReserved(my_context ctx); - void exit(); - }; - - struct Deleting : boost::statechart::state<Deleting, ToDelete>, NamedState { - typedef boost::mpl::list < - boost::statechart::custom_reaction< DeleteSome >, - boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved> - > reactions; - explicit Deleting(my_context ctx); - boost::statechart::result react(const DeleteSome &evt); - void exit(); - }; - - struct GetLog; - - struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState { - set<pg_shard_t> peer_info_requested; - - explicit GetInfo(my_context ctx); - void exit(); - void get_infos(); - - typedef boost::mpl::list < - boost::statechart::custom_reaction< QueryState >, - boost::statechart::transition< GotInfo, GetLog >, - boost::statechart::custom_reaction< MNotifyRec >, - boost::statechart::transition< IsDown, Down > - > reactions; - boost::statechart::result react(const QueryState& q); - boost::statechart::result react(const MNotifyRec& infoevt); - }; - - struct GotLog : boost::statechart::event< GotLog > { - GotLog() : boost::statechart::event< GotLog >() {} - }; - - struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState { - pg_shard_t auth_log_shard; - boost::intrusive_ptr<MOSDPGLog> msg; - - explicit GetLog(my_context ctx); - void exit(); - - typedef boost::mpl::list < - boost::statechart::custom_reaction< QueryState >, - boost::statechart::custom_reaction< MLogRec >, - boost::statechart::custom_reaction< GotLog >, - boost::statechart::custom_reaction< AdvMap >, - boost::statechart::transition< IsIncomplete, Incomplete > - > reactions; - boost::statechart::result react(const AdvMap&); - boost::statechart::result react(const QueryState& q); - boost::statechart::result react(const MLogRec& logevt); - boost::statechart::result react(const GotLog&); - }; - - struct WaitUpThru; - - struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState { - set<pg_shard_t> peer_missing_requested; - - explicit GetMissing(my_context ctx); - void exit(); - - typedef boost::mpl::list < - boost::statechart::custom_reaction< QueryState >, - boost::statechart::custom_reaction< MLogRec >, - boost::statechart::transition< NeedUpThru, WaitUpThru > - > reactions; - boost::statechart::result react(const QueryState& q); - boost::statechart::result react(const MLogRec& logevt); - }; - - struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState { - explicit WaitUpThru(my_context ctx); - void exit(); - - typedef boost::mpl::list < - boost::statechart::custom_reaction< QueryState >, - boost::statechart::custom_reaction< ActMap >, - boost::statechart::custom_reaction< MLogRec > - > reactions; - boost::statechart::result react(const QueryState& q); - boost::statechart::result react(const ActMap& am); - boost::statechart::result react(const MLogRec& logrec); - }; - - struct Down : boost::statechart::state< Down, Peering>, NamedState { - explicit Down(my_context ctx); - typedef boost::mpl::list < - boost::statechart::custom_reaction< QueryState >, - boost::statechart::custom_reaction< MNotifyRec > - > reactions; - boost::statechart::result react(const QueryState& q); - boost::statechart::result react(const MNotifyRec& infoevt); - void exit(); - }; - - struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState { - typedef boost::mpl::list < - boost::statechart::custom_reaction< AdvMap >, - boost::statechart::custom_reaction< MNotifyRec >, - boost::statechart::custom_reaction< QueryState > - > reactions; - explicit Incomplete(my_context ctx); - boost::statechart::result react(const AdvMap &advmap); - boost::statechart::result react(const MNotifyRec& infoevt); - boost::statechart::result react(const QueryState& q); - void exit(); - }; - - PeeringMachine machine; - PG *pg; - - /// context passed in by state machine caller - RecoveryCtx *orig_ctx; - - /// populated if we are buffering messages pending a flush - boost::optional<BufferedRecoveryMessages> messages_pending_flush; - - /** - * populated between start_handle() and end_handle(), points into - * the message lists for messages_pending_flush while blocking messages - * or into orig_ctx otherwise - */ - boost::optional<RecoveryCtx> rctx; - - public: - explicit PeeringState(PG *pg) - : machine(this, pg), pg(pg), orig_ctx(0) { - machine.initiate(); - } - - void handle_event(const boost::statechart::event_base &evt, - RecoveryCtx *rctx) { - start_handle(rctx); - machine.process_event(evt); - end_handle(); - } - - void handle_event(PGPeeringEventRef evt, - RecoveryCtx *rctx) { - start_handle(rctx); - machine.process_event(evt->get_event()); - end_handle(); - } - - } recovery_state; - - + PeeringState recovery_state; uint64_t peer_features; uint64_t acting_features; diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc new file mode 100644 index 00000000000..ba4579a16c1 --- /dev/null +++ b/src/osd/PeeringState.cc @@ -0,0 +1,2797 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "PGPeeringEvent.h" +#include "common/dout.h" +#include "PeeringState.h" +#include "PG.h" +#include "OSD.h" + +#include "messages/MBackfillReserve.h" +#include "messages/MRecoveryReserve.h" +#include "messages/MOSDScrubReserve.h" + +void PeeringState::PeeringMachine::send_query( + pg_shard_t to, const pg_query_t &query) { + ceph_assert(state->rctx); + ceph_assert(state->rctx->query_map); + (*state->rctx->query_map)[to.osd][ + spg_t(pg->info.pgid.pgid, to.shard)] = query; +} + + +/*------------ Peering State Machine----------------*/ +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix (context< PeeringMachine >().pg->gen_prefix(*_dout) \ + << "state<" << get_state_name() << ">: ") + +/*------NamedState----*/ +PeeringState::NamedState::NamedState( + PG *pg_, const char *state_name_) + : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) { + pg->pgstate_history.enter(pg, enter_time, state_name); +} + +PeeringState::NamedState::~NamedState() { + pg->pgstate_history.exit(state_name); +} + +/*------Crashed-------*/ +PeeringState::Crashed::Crashed(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Crashed") +{ + context< PeeringMachine >().log_enter(state_name); + ceph_abort_msg("we got a bad state machine event"); +} + + +/*------Initial-------*/ +PeeringState::Initial::Initial(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Initial") +{ + context< PeeringMachine >().log_enter(state_name); +} + +boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify) +{ + PG *pg = context< PeeringMachine >().pg; + pg->proc_replica_info( + notify.from, notify.notify.info, notify.notify.epoch_sent); + pg->set_last_peering_reset(); + return transit< Primary >(); +} + +boost::statechart::result PeeringState::Initial::react(const MInfoRec& i) +{ + PG *pg = context< PeeringMachine >().pg; +
ceph_assert(!pg->is_primary()); + post_event(i); + return transit< Stray >(); +} + +boost::statechart::result PeeringState::Initial::react(const MLogRec& i) +{ + PG *pg = context< PeeringMachine >().pg; + ceph_assert(!pg->is_primary()); + post_event(i); + return transit< Stray >(); +} + +void PeeringState::Initial::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur); +} + +/*------Started-------*/ +PeeringState::Started::Started(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started") +{ + context< PeeringMachine >().log_enter(state_name); +} + +boost::statechart::result +PeeringState::Started::react(const IntervalFlush&) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl; + context< PeeringMachine >().pg->recovery_state.end_block_outgoing(); + return discard_event(); +} + +boost::statechart::result PeeringState::Started::react(const AdvMap& advmap) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "Started advmap" << dendl; + pg->check_full_transition(advmap.lastmap, advmap.osdmap); + if (pg->should_restart_peering( + advmap.up_primary, + advmap.acting_primary, + advmap.newup, + advmap.newacting, + advmap.lastmap, + advmap.osdmap)) { + ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset" + << dendl; + post_event(advmap); + return transit< Reset >(); + } + pg->remove_down_peer_info(advmap.osdmap); + return discard_event(); +} + +boost::statechart::result PeeringState::Started::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->close_section(); + return discard_event(); +} + +void PeeringState::Started::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_started_latency, dur); +} + +/*--------Reset---------*/ +PeeringState::Reset::Reset(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Reset") +{ + context< PeeringMachine >().log_enter(state_name); + PG *pg = context< PeeringMachine >().pg; + + pg->flushes_in_progress = 0; + pg->set_last_peering_reset(); +} + +boost::statechart::result +PeeringState::Reset::react(const IntervalFlush&) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl; + context< PeeringMachine >().pg->recovery_state.end_block_outgoing(); + return discard_event(); +} + +boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "Reset advmap" << dendl; + + pg->check_full_transition(advmap.lastmap, advmap.osdmap); + + if (pg->should_restart_peering( + advmap.up_primary, + advmap.acting_primary, + advmap.newup, + advmap.newacting, + advmap.lastmap, + advmap.osdmap)) { + ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again" + << dendl; + pg->start_peering_interval( + advmap.lastmap, + advmap.newup, advmap.up_primary, + advmap.newacting, advmap.acting_primary, + context< PeeringMachine >().get_cur_transaction()); + } + pg->remove_down_peer_info(advmap.osdmap); + pg->check_past_interval_bounds(); 
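+ // note: we deliberately stay in Reset here; the ActMap reaction below is + // what moves us back to Started once the new map has been acted on.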
+ return discard_event(); +} + +boost::statechart::result PeeringState::Reset::react(const ActMap&) +{ + PG *pg = context< PeeringMachine >().pg; + if (pg->should_send_notify() && pg->get_primary().osd >= 0) { + context< PeeringMachine >().send_notify( + pg->get_primary(), + pg_notify_t( + pg->get_primary().shard, pg->pg_whoami.shard, + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + pg->info), + pg->past_intervals); + } + + pg->update_heartbeat_peers(); + pg->take_waiters(); + + return transit< Started >(); +} + +boost::statechart::result PeeringState::Reset::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->close_section(); + return discard_event(); +} + +void PeeringState::Reset::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur); +} + +/*-------Start---------*/ +PeeringState::Start::Start(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Start") +{ + context< PeeringMachine >().log_enter(state_name); + + PG *pg = context< PeeringMachine >().pg; + if (pg->is_primary()) { + ldout(pg->cct, 1) << "transitioning to Primary" << dendl; + post_event(MakePrimary()); + } else { //is_stray + ldout(pg->cct, 1) << "transitioning to Stray" << dendl; + post_event(MakeStray()); + } +} + +void PeeringState::Start::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_start_latency, dur); +} + +/*---------Primary--------*/ +PeeringState::Primary::Primary(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary") +{ + context< PeeringMachine >().log_enter(state_name); + PG *pg = context< PeeringMachine >().pg; + ceph_assert(pg->want_acting.empty()); + + // set CREATING bit until we have peered for the first time. + if (pg->info.history.last_epoch_started == 0) { + pg->state_set(PG_STATE_CREATING); + // use the history timestamp, which ultimately comes from the + // monitor in the create case. + utime_t t = pg->info.history.last_scrub_stamp; + pg->info.stats.last_fresh = t; + pg->info.stats.last_active = t; + pg->info.stats.last_change = t; + pg->info.stats.last_peered = t; + pg->info.stats.last_clean = t; + pg->info.stats.last_unstale = t; + pg->info.stats.last_undegraded = t; + pg->info.stats.last_fullsized = t; + pg->info.stats.last_scrub_stamp = t; + pg->info.stats.last_deep_scrub_stamp = t; + pg->info.stats.last_clean_scrub_stamp = t; + } +} + +boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 7) << "handle_pg_notify from osd." 
<< notevt.from << dendl; + pg->proc_replica_info( + notevt.from, notevt.notify.info, notevt.notify.epoch_sent); + return discard_event(); +} + +boost::statechart::result PeeringState::Primary::react(const ActMap&) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 7) << "handle ActMap primary" << dendl; + pg->publish_stats_to_osd(); + pg->take_waiters(); + return discard_event(); +} + +boost::statechart::result PeeringState::Primary::react( + const SetForceRecovery&) +{ + PG *pg = context< PeeringMachine >().pg; + pg->set_force_recovery(true); + return discard_event(); +} + +boost::statechart::result PeeringState::Primary::react( + const UnsetForceRecovery&) +{ + PG *pg = context< PeeringMachine >().pg; + pg->set_force_recovery(false); + return discard_event(); +} + +boost::statechart::result PeeringState::Primary::react( + const RequestScrub& evt) +{ + PG *pg = context< PeeringMachine >().pg; + if (pg->is_primary()) { + pg->unreg_next_scrub(); + pg->scrubber.must_scrub = true; + pg->scrubber.must_deep_scrub = evt.deep || evt.repair; + pg->scrubber.must_repair = evt.repair; + pg->reg_next_scrub(); + ldout(pg->cct,10) << "marking for scrub" << dendl; + } + return discard_event(); +} + +boost::statechart::result PeeringState::Primary::react( + const SetForceBackfill&) +{ + PG *pg = context< PeeringMachine >().pg; + pg->set_force_backfill(true); + return discard_event(); +} + +boost::statechart::result PeeringState::Primary::react( + const UnsetForceBackfill&) +{ + PG *pg = context< PeeringMachine >().pg; + pg->set_force_backfill(false); + return discard_event(); +} + +void PeeringState::Primary::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + pg->want_acting.clear(); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur); + pg->clear_primary_state(); + pg->state_clear(PG_STATE_CREATING); +} + +/*---------Peering--------*/ +PeeringState::Peering::Peering(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Peering"), + history_les_bound(false) +{ + context< PeeringMachine >().log_enter(state_name); + + PG *pg = context< PeeringMachine >().pg; + ceph_assert(!pg->is_peered()); + ceph_assert(!pg->is_peering()); + ceph_assert(pg->is_primary()); + pg->state_set(PG_STATE_PEERING); +} + +boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "Peering advmap" << dendl; + if (prior_set.affected_by_map(*(advmap.osdmap), pg)) { + ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl; + post_event(advmap); + return transit< Reset >(); + } + + pg->adjust_need_up_thru(advmap.osdmap); + + return forward_event(); +} + +boost::statechart::result PeeringState::Peering::react(const QueryState& q) +{ + PG *pg = context< PeeringMachine >().pg; + + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + + q.f->open_array_section("past_intervals"); + pg->past_intervals.dump(q.f); + q.f->close_section(); + + q.f->open_array_section("probing_osds"); + for (set<pg_shard_t>::iterator p = prior_set.probe.begin(); + p != prior_set.probe.end(); + ++p) + q.f->dump_stream("osd") << *p; + q.f->close_section(); + + if (prior_set.pg_down) + q.f->dump_string("blocked", "peering is blocked due to down osds"); + + q.f->open_array_section("down_osds_we_would_probe"); + for (set<int>::iterator p = prior_set.down.begin(); + p != prior_set.down.end(); + ++p) + q.f->dump_int("osd", *p); + q.f->close_section(); + + q.f->open_array_section("peering_blocked_by"); + for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin(); + p != prior_set.blocked_by.end(); + ++p) { + q.f->open_object_section("osd"); + q.f->dump_int("osd", p->first); + q.f->dump_int("current_lost_at", p->second); + q.f->dump_string("comment", "starting or marking this osd lost may let us proceed"); + q.f->close_section(); + } + q.f->close_section(); + + if (history_les_bound) { + q.f->open_array_section("peering_blocked_by_detail"); + q.f->open_object_section("item"); + q.f->dump_string("detail","peering_blocked_by_history_les_bound"); + q.f->close_section(); + q.f->close_section(); + } + + q.f->close_section(); + return forward_event(); +} + +void PeeringState::Peering::exit() +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "Leaving Peering" << dendl; + context< PeeringMachine >().log_exit(state_name, enter_time); + pg->state_clear(PG_STATE_PEERING); + pg->clear_probe_targets(); + + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur); +} + + +/*------Backfilling-------*/ +PeeringState::Backfilling::Backfilling(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/Backfilling") +{ + context< PeeringMachine >().log_enter(state_name); + PG *pg = context< PeeringMachine >().pg; + pg->backfill_reserved = true; + pg->queue_recovery(); + pg->state_clear(PG_STATE_BACKFILL_TOOFULL); + pg->state_clear(PG_STATE_BACKFILL_WAIT); + pg->state_set(PG_STATE_BACKFILLING); + pg->publish_stats_to_osd(); +} + +void PeeringState::Backfilling::backfill_release_reservations() +{ + PG *pg = context< PeeringMachine >().pg; + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin(); + it != pg->backfill_targets.end(); + ++it) { + ceph_assert(*it != pg->pg_whoami); + ConnectionRef con = pg->osd->get_con_osd_cluster( + it->osd, pg->get_osdmap_epoch()); + if (con) { + pg->osd->send_message_osd_cluster( + new MBackfillReserve( + MBackfillReserve::RELEASE, + spg_t(pg->info.pgid.pgid, it->shard), + pg->get_osdmap_epoch()), + con.get()); + } + } +} + +void PeeringState::Backfilling::cancel_backfill() +{ + PG *pg = context< PeeringMachine >().pg; + backfill_release_reservations(); + if (!pg->waiting_on_backfill.empty()) { + pg->waiting_on_backfill.clear(); + pg->finish_recovery_op(hobject_t::get_max()); + } +} + +boost::statechart::result +PeeringState::Backfilling::react(const Backfilled &c) +{ + backfill_release_reservations(); + return transit<Recovered>(); +} + +boost::statechart::result +PeeringState::Backfilling::react(const DeferBackfill &c) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl; + pg->state_set(PG_STATE_BACKFILL_WAIT); + pg->state_clear(PG_STATE_BACKFILLING); + cancel_backfill(); + pg->schedule_backfill_retry(c.delay); + return transit<NotBackfilling>(); +} + +boost::statechart::result +PeeringState::Backfilling::react(const UnfoundBackfill &c) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl; + pg->state_set(PG_STATE_BACKFILL_UNFOUND); + pg->state_clear(PG_STATE_BACKFILLING); + cancel_backfill(); + return transit<NotBackfilling>(); +} + +boost::statechart::result +PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &) +{ + PG *pg = context< PeeringMachine >().pg; + pg->state_set(PG_STATE_BACKFILL_TOOFULL); + pg->state_clear(PG_STATE_BACKFILLING); + cancel_backfill(); + pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval); + return transit<NotBackfilling>(); +} + +boost::statechart::result +PeeringState::Backfilling::react(const RemoteReservationRevoked &) +{ + PG *pg = context< PeeringMachine >().pg; + pg->state_set(PG_STATE_BACKFILL_WAIT); + cancel_backfill(); + if (pg->needs_backfill()) { + return transit<WaitLocalBackfillReserved>(); + } else { + // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore + return discard_event(); + } +} + +void PeeringState::Backfilling::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + pg->backfill_reserved = false; + pg->backfill_reserving = false; + pg->state_clear(PG_STATE_BACKFILLING); + pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur); +} + +/*--WaitRemoteBackfillReserved--*/ + +PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"), + backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin()) +{ + context< PeeringMachine >().log_enter(state_name); + PG *pg = context< PeeringMachine >().pg; + pg->state_set(PG_STATE_BACKFILL_WAIT); + pg->publish_stats_to_osd(); + post_event(RemoteBackfillReserved()); +} + +boost::statechart::result +PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt) +{ + PG *pg = context< PeeringMachine >().pg; + + int64_t num_bytes = pg->info.stats.stats.sum.num_bytes; + ldout(pg->cct, 10) << __func__ << " num_bytes " << num_bytes << dendl; + if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) { + //The primary never backfills itself + ceph_assert(*backfill_osd_it != pg->pg_whoami); + ConnectionRef con = pg->osd->get_con_osd_cluster( + backfill_osd_it->osd, pg->get_osdmap_epoch()); + if (con) { + pg->osd->send_message_osd_cluster( + new MBackfillReserve( + MBackfillReserve::REQUEST, + spg_t(pg->info.pgid.pgid, backfill_osd_it->shard), + pg->get_osdmap_epoch(), + pg->get_backfill_priority(), + num_bytes, + pg->peer_bytes[*backfill_osd_it]), + con.get()); + } + ++backfill_osd_it; + } else { + pg->peer_bytes.clear(); + post_event(AllBackfillsReserved()); + } + return discard_event(); +} + +void PeeringState::WaitRemoteBackfillReserved::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur); +} + +void PeeringState::WaitRemoteBackfillReserved::retry() +{ + PG *pg = context< PeeringMachine >().pg; + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + + // Send CANCEL to all previously acquired reservations + set<pg_shard_t>::const_iterator it, begin, end; + begin = context< Active >().remote_shards_to_reserve_backfill.begin(); + end = context< Active >().remote_shards_to_reserve_backfill.end(); + ceph_assert(begin != end); + for (it = begin; it != backfill_osd_it; ++it) { + //The primary never backfills itself + ceph_assert(*it != pg->pg_whoami); + ConnectionRef con = pg->osd->get_con_osd_cluster( + it->osd, pg->get_osdmap_epoch()); + if (con) { + pg->osd->send_message_osd_cluster( + new MBackfillReserve( + MBackfillReserve::RELEASE, + spg_t(pg->info.pgid.pgid, it->shard), + pg->get_osdmap_epoch()), + con.get()); + } + } + + pg->state_clear(PG_STATE_BACKFILL_WAIT); + pg->state_set(PG_STATE_BACKFILL_TOOFULL); + pg->publish_stats_to_osd(); + + pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval); +} + +boost::statechart::result +PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt) +{ + retry(); + return transit<NotBackfilling>(); +} + +boost::statechart::result +PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt) +{ + retry(); + return transit<NotBackfilling>(); +} + +/*--WaitLocalBackfillReserved--*/ +PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved") +{ + context< PeeringMachine >().log_enter(state_name); + PG *pg = context< PeeringMachine >().pg; + pg->state_set(PG_STATE_BACKFILL_WAIT); + pg->osd->local_reserver.request_reservation( + pg->info.pgid, + new PG::QueuePeeringEvt<LocalBackfillReserved>( + pg, pg->get_osdmap_epoch(), + LocalBackfillReserved()), + pg->get_backfill_priority(), + new PG::QueuePeeringEvt<DeferBackfill>( + pg, pg->get_osdmap_epoch(), + DeferBackfill(0.0))); + pg->publish_stats_to_osd(); +} + +void PeeringState::WaitLocalBackfillReserved::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur); +} + +/*----NotBackfilling------*/ +PeeringState::NotBackfilling::NotBackfilling(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/NotBackfilling") +{ + context< PeeringMachine >().log_enter(state_name); + PG *pg = context< PeeringMachine >().pg; + pg->state_clear(PG_STATE_REPAIR); + pg->publish_stats_to_osd(); +} + +boost::statechart::result +PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt) +{ + return discard_event(); +} + +boost::statechart::result +PeeringState::NotBackfilling::react(const RemoteReservationRejected &evt) +{ + return discard_event(); +} + +void PeeringState::NotBackfilling::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + pg->state_clear(PG_STATE_BACKFILL_UNFOUND); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur); +} + +/*----NotRecovering------*/ +PeeringState::NotRecovering::NotRecovering(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/NotRecovering") +{ + context< PeeringMachine >().log_enter(state_name); + PG *pg = context< PeeringMachine >().pg; + pg->publish_stats_to_osd(); +} + +void PeeringState::NotRecovering::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + pg->state_clear(PG_STATE_RECOVERY_UNFOUND); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur); +} + +/*---RepNotRecovering----*/ +PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/ReplicaActive/RepNotRecovering") +{ + context< PeeringMachine >().log_enter(state_name); +} + +boost::statechart::result +PeeringState::RepNotRecovering::react(const RejectRemoteReservation &evt) +{ + PG *pg = context< PeeringMachine >().pg; + pg->reject_reservation(); + post_event(RemoteReservationRejected()); + return discard_event(); +} + +void PeeringState::RepNotRecovering::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur); +} + +/*---RepWaitRecoveryReserved--*/ +PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved") +{ + context< PeeringMachine >().log_enter(state_name); +} + +boost::statechart::result +PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt) +{ + PG *pg = context< PeeringMachine >().pg; + pg->osd->send_message_osd_cluster( + pg->primary.osd, + new MRecoveryReserve( + MRecoveryReserve::GRANT, + spg_t(pg->info.pgid.pgid, pg->primary.shard), + pg->get_osdmap_epoch()), + pg->get_osdmap_epoch()); + return transit<RepRecovering>(); +} + +boost::statechart::result +PeeringState::RepWaitRecoveryReserved::react( + const RemoteReservationCanceled &evt) +{ + PG *pg = context< PeeringMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); + return transit<RepNotRecovering>(); +} + +void PeeringState::RepWaitRecoveryReserved::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur); +} + +/*-RepWaitBackfillReserved*/ +PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved") +{ + context< PeeringMachine >().log_enter(state_name); +} + +// Compute pending backfill data +static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes) +{ + lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " << (local_bytes >> 10) << "KiB" + << " primary usage " << (bf_bytes >> 10) << "KiB" << dendl; + return std::max((int64_t)0, bf_bytes - local_bytes); +} + +boost::statechart::result +PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt) +{ + PG *pg = context< PeeringMachine >().pg; + // Use tentative_backfill_full() to make sure enough + // space is available to handle target bytes from primary. + + // TODO: If we passed num_objects from primary we could account for + // an estimate of the metadata overhead. + + // TODO: If we had compressed_allocated and compressed_original from primary + // we could compute compression ratio and adjust accordingly. + + // XXX: There is no way to get omap overhead and this would only apply + // to whatever possibly different partition that is storing the database. + + // update_osd_stat() from heartbeat will do this on a new + // statfs using pg->primary_num_bytes.
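+ // The adjustment below is an estimate: pending_backfill() (defined above) + // treats max(0, primary bytes - local bytes) as the data still to arrive here.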
+ uint64_t pending_adjustment = 0; + int64_t primary_num_bytes = evt.primary_num_bytes; + int64_t local_num_bytes = evt.local_num_bytes; + if (primary_num_bytes) { + // For erasure coded pool overestimate by a full stripe per object + // because we don't know how each object is rounded to the nearest stripe + if (pg->pool.info.is_erasure()) { + primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count(); + primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects; + local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count(); + local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects; + } + pending_adjustment = pending_backfill(pg->cct, primary_num_bytes, local_num_bytes); + ldout(pg->cct, 10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB" + << " local " << (local_num_bytes >> 10) << "KiB" + << " pending_adjustments " << (pending_adjustment >> 10) << "KiB" + << dendl; + } + // This lock protects not only the stats OSDService but also setting the pg primary_num_bytes + // That's why we don't immediately unlock + Mutex::Locker l(pg->osd->stat_lock); + osd_stat_t cur_stat = pg->osd->osd_stat; + if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 && + (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) { + ldout(pg->cct, 10) << "backfill reservation rejected: failure injection" + << dendl; + post_event(RejectRemoteReservation()); + } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation && + pg->osd->tentative_backfill_full(pg, pending_adjustment, cur_stat)) { + ldout(pg->cct, 10) << "backfill reservation rejected: backfill full" + << dendl; + post_event(RejectRemoteReservation()); + } else { + Context *preempt = nullptr; + // Don't reserve space if skipped reservation check, this is used + // to test the other backfill full check AND in case a corruption + // of num_bytes requires ignoring that value and trying the + // backfill anyway. + if (primary_num_bytes && !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) + pg->set_reserved_num_bytes(primary_num_bytes, local_num_bytes); + else + pg->clear_reserved_num_bytes(); + // Use un-ec-adjusted bytes for stats. + pg->info.stats.stats.sum.num_bytes = evt.local_num_bytes; + if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) { + // older peers will interpret preemption as TOOFULL + preempt = new PG::QueuePeeringEvt<RemoteBackfillPreempted>( + pg, pg->get_osdmap_epoch(), + RemoteBackfillPreempted()); + } + pg->osd->remote_reserver.request_reservation( + pg->info.pgid, + new PG::QueuePeeringEvt<RemoteBackfillReserved>( + pg, pg->get_osdmap_epoch(), + RemoteBackfillReserved()), + evt.priority, + preempt); + } + return transit<RepWaitBackfillReserved>(); +} + +boost::statechart::result +PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt) +{ + PG *pg = context< PeeringMachine >().pg; + + // fall back to a local reckoning of priority if the primary doesn't pass one + // (pre-mimic compat) + int prio = evt.priority ? evt.priority : pg->get_recovery_priority(); + + Context *preempt = nullptr; + if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) { + // older peers can't handle this + preempt = new PG::QueuePeeringEvt<RemoteRecoveryPreempted>( + pg, pg->get_osdmap_epoch(), + RemoteRecoveryPreempted()); + } + + pg->osd->remote_reserver.request_reservation( + pg->info.pgid, + new PG::QueuePeeringEvt<RemoteRecoveryReserved>( + pg, pg->get_osdmap_epoch(), + RemoteRecoveryReserved()), + prio, + preempt); + return transit<RepWaitRecoveryReserved>(); +} + +void PeeringState::RepWaitBackfillReserved::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur); +} + +boost::statechart::result +PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt) +{ + PG *pg = context< PeeringMachine >().pg; + + pg->osd->send_message_osd_cluster( + pg->primary.osd, + new MBackfillReserve( + MBackfillReserve::GRANT, + spg_t(pg->info.pgid.pgid, pg->primary.shard), + pg->get_osdmap_epoch()), + pg->get_osdmap_epoch()); + return transit<RepRecovering>(); +} + +boost::statechart::result +PeeringState::RepWaitBackfillReserved::react( + const RejectRemoteReservation &evt) +{ + PG *pg = context< PeeringMachine >().pg; + pg->reject_reservation(); + post_event(RemoteReservationRejected()); + return discard_event(); +} + +boost::statechart::result +PeeringState::RepWaitBackfillReserved::react( + const RemoteReservationRejected &evt) +{ + PG *pg = context< PeeringMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); + return transit<RepNotRecovering>(); +} + +boost::statechart::result +PeeringState::RepWaitBackfillReserved::react( + const RemoteReservationCanceled &evt) +{ + PG *pg = context< PeeringMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); + return transit<RepNotRecovering>(); +} + +/*---RepRecovering-------*/ +PeeringState::RepRecovering::RepRecovering(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/ReplicaActive/RepRecovering") +{ + context< PeeringMachine >().log_enter(state_name); +} + +boost::statechart::result +PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &) +{ + PG *pg = context< PeeringMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->send_message_osd_cluster( + pg->primary.osd, + new MRecoveryReserve( + MRecoveryReserve::REVOKE, + spg_t(pg->info.pgid.pgid, pg->primary.shard), + pg->get_osdmap_epoch()), + pg->get_osdmap_epoch()); + return discard_event(); +} + +boost::statechart::result +PeeringState::RepRecovering::react(const BackfillTooFull &) +{ + PG *pg = context< PeeringMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->send_message_osd_cluster( + pg->primary.osd, + new MBackfillReserve( + MBackfillReserve::TOOFULL, + spg_t(pg->info.pgid.pgid, pg->primary.shard), + pg->get_osdmap_epoch()), + pg->get_osdmap_epoch()); + return discard_event(); +} + +boost::statechart::result +PeeringState::RepRecovering::react(const RemoteBackfillPreempted &) +{ + PG *pg = context< PeeringMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->send_message_osd_cluster( + pg->primary.osd, + new MBackfillReserve( + MBackfillReserve::REVOKE, + spg_t(pg->info.pgid.pgid, pg->primary.shard), + pg->get_osdmap_epoch()), + pg->get_osdmap_epoch()); + return discard_event(); +} + +void PeeringState::RepRecovering::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + pg->clear_reserved_num_bytes(); + pg->osd->remote_reserver.cancel_reservation(pg->info.pgid); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur); +} + +/*------Activating--------*/ +PeeringState::Activating::Activating(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/Activating") +{ + context< PeeringMachine >().log_enter(state_name); +} + +void PeeringState::Activating::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur); +} + +PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved") +{ + context< PeeringMachine >().log_enter(state_name); + PG *pg = context< PeeringMachine >().pg; + + // Make sure all nodes that are part of the recovery aren't full + if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery && + pg->osd->check_osdmap_full(pg->acting_recovery_backfill)) { + post_event(RecoveryTooFull()); + return; + } + + pg->state_clear(PG_STATE_RECOVERY_TOOFULL); + pg->state_set(PG_STATE_RECOVERY_WAIT); + pg->osd->local_reserver.request_reservation( + pg->info.pgid, + new PG::QueuePeeringEvt<LocalRecoveryReserved>( + pg, pg->get_osdmap_epoch(), + LocalRecoveryReserved()), + pg->get_recovery_priority(), + new PG::QueuePeeringEvt<DeferRecovery>( + pg, pg->get_osdmap_epoch(), + DeferRecovery(0.0))); + pg->publish_stats_to_osd(); +} + +boost::statechart::result +PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt) +{ + PG *pg = context< PeeringMachine >().pg; + pg->state_set(PG_STATE_RECOVERY_TOOFULL); + pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval); + return transit<NotRecovering>(); +} + +void PeeringState::WaitLocalRecoveryReserved::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur); +} + +PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"), + remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin()) +{ + context< PeeringMachine >().log_enter(state_name); + post_event(RemoteRecoveryReserved()); +} + +boost::statechart::result +PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) { + PG *pg = context< PeeringMachine >().pg; + + if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) { + ceph_assert(*remote_recovery_reservation_it != pg->pg_whoami); + ConnectionRef con = pg->osd->get_con_osd_cluster( + remote_recovery_reservation_it->osd, pg->get_osdmap_epoch()); + if (con) { + pg->osd->send_message_osd_cluster( + new MRecoveryReserve( + MRecoveryReserve::REQUEST, + spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard), + pg->get_osdmap_epoch(), + pg->get_recovery_priority()), + con.get()); + } + ++remote_recovery_reservation_it; + } else { + post_event(AllRemotesReserved()); + } + return discard_event(); +} + +void PeeringState::WaitRemoteRecoveryReserved::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur); +} + +PeeringState::Recovering::Recovering(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/Recovering") +{ + context< PeeringMachine >().log_enter(state_name); + + PG *pg = context< PeeringMachine >().pg; + pg->state_clear(PG_STATE_RECOVERY_WAIT); + pg->state_clear(PG_STATE_RECOVERY_TOOFULL); + pg->state_set(PG_STATE_RECOVERING); + ceph_assert(!pg->state_test(PG_STATE_ACTIVATING)); + pg->publish_stats_to_osd(); + pg->queue_recovery(); +} + +void PeeringState::Recovering::release_reservations(bool cancel) +{ + PG *pg = context< PeeringMachine >().pg; + ceph_assert(cancel || !pg->pg_log.get_missing().have_missing()); + + // release remote reservations + for (set<pg_shard_t>::const_iterator i = + context< Active >().remote_shards_to_reserve_recovery.begin(); + i != context< Active >().remote_shards_to_reserve_recovery.end(); + ++i) { + if (*i == pg->pg_whoami) // skip myself + continue; + ConnectionRef con = pg->osd->get_con_osd_cluster( + i->osd, pg->get_osdmap_epoch()); + if (con) { + pg->osd->send_message_osd_cluster( + new MRecoveryReserve( + MRecoveryReserve::RELEASE, + spg_t(pg->info.pgid.pgid, i->shard), + pg->get_osdmap_epoch()), + con.get()); + } + } +} + +boost::statechart::result +PeeringState::Recovering::react(const AllReplicasRecovered &evt) +{ + PG *pg = context< PeeringMachine >().pg; + pg->state_clear(PG_STATE_FORCED_RECOVERY); + release_reservations(); + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + return transit<Recovered>(); +} + +boost::statechart::result +PeeringState::Recovering::react(const RequestBackfill &evt) +{ + PG *pg = context< PeeringMachine >().pg; + pg->state_clear(PG_STATE_FORCED_RECOVERY); + release_reservations(); + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + // XXX: Is this needed? + pg->publish_stats_to_osd(); + return transit<WaitLocalBackfillReserved>(); +} + +boost::statechart::result +PeeringState::Recovering::react(const DeferRecovery &evt) +{ + PG *pg = context< PeeringMachine >().pg; + if (!pg->state_test(PG_STATE_RECOVERING)) { + // we may have finished recovery and have an AllReplicasRecovered + // event queued to move us to the next state.
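+ // In that case this DeferRecovery is stale; drop it and let the queued + // event drive the transition.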
+ ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl; + return discard_event(); + } + ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl; + pg->state_set(PG_STATE_RECOVERY_WAIT); + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + release_reservations(true); + pg->schedule_recovery_retry(evt.delay); + return transit<NotRecovering>(); +} + +boost::statechart::result +PeeringState::Recovering::react(const UnfoundRecovery &evt) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl; + pg->state_set(PG_STATE_RECOVERY_UNFOUND); + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + release_reservations(true); + return transit<NotRecovering>(); +} + +void PeeringState::Recovering::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->state_clear(PG_STATE_RECOVERING); + pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur); +} + +PeeringState::Recovered::Recovered(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/Recovered") +{ + pg_shard_t auth_log_shard; + + context< PeeringMachine >().log_enter(state_name); + + PG *pg = context< PeeringMachine >().pg; + + ceph_assert(!pg->needs_recovery()); + + // if we finished backfill, all acting are active; recheck if + // DEGRADED | UNDERSIZED is appropriate. + ceph_assert(!pg->acting_recovery_backfill.empty()); + if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= + pg->acting_recovery_backfill.size()) { + pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY); + pg->publish_stats_to_osd(); + } + + // adjust acting set? (e.g. because backfill completed...)
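+ // choose_acting() recomputes want_acting now that recovery/backfill is + // done, e.g. so async recovery targets can rejoin the acting set.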
+ bool history_les_bound = false; + if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard, + true, &history_les_bound)) { + ceph_assert(pg->want_acting.size()); + } else if (!pg->async_recovery_targets.empty()) { + pg->choose_acting(auth_log_shard, true, &history_les_bound); + } + + if (context< Active >().all_replicas_activated && + pg->async_recovery_targets.empty()) + post_event(GoClean()); +} + +void PeeringState::Recovered::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur); +} + +PeeringState::Clean::Clean(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Active/Clean") +{ + context< PeeringMachine >().log_enter(state_name); + + PG *pg = context< PeeringMachine >().pg; + + if (pg->info.last_complete != pg->info.last_update) { + ceph_abort(); + } + Context *c = pg->finish_recovery(); + context< PeeringMachine >().get_cur_transaction()->register_on_commit(c); + + pg->try_mark_clean(); +} + +void PeeringState::Clean::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + pg->state_clear(PG_STATE_CLEAN); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur); +} + +template <typename T> +set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in) +{ + set<int> osds_found; + set<pg_shard_t> out; + for (typename T::const_iterator i = in.begin(); + i != in.end(); + ++i) { + if (*i != skip && !osds_found.count(i->osd)) { + osds_found.insert(i->osd); + out.insert(*i); + } + } + return out; +} + +/*---------Active---------*/ +PeeringState::Active::Active(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Active"), + remote_shards_to_reserve_recovery( + unique_osd_shard_set( + context< PeeringMachine >().pg->pg_whoami, + context< PeeringMachine >().pg->acting_recovery_backfill)), + remote_shards_to_reserve_backfill( + unique_osd_shard_set( + context< PeeringMachine >().pg->pg_whoami, + context< PeeringMachine >().pg->backfill_targets)), + all_replicas_activated(false) +{ + context< PeeringMachine >().log_enter(state_name); + + PG *pg = context< PeeringMachine >().pg; + + ceph_assert(!pg->backfill_reserving); + ceph_assert(!pg->backfill_reserved); + ceph_assert(pg->is_primary()); + ldout(pg->cct, 10) << "In Active, about to call activate" << dendl; + pg->start_flush(context< PeeringMachine >().get_cur_transaction()); + pg->activate(*context< PeeringMachine >().get_cur_transaction(), + pg->get_osdmap_epoch(), + *context< PeeringMachine >().get_query_map(), + context< PeeringMachine >().get_info_map(), + context< PeeringMachine >().get_recovery_ctx()); + + // everyone has to commit/ack before we are truly active + pg->blocked_by.clear(); + for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin(); + p != pg->acting_recovery_backfill.end(); + ++p) { + if (p->shard != pg->pg_whoami.shard) { + pg->blocked_by.insert(p->shard); + } + } + pg->publish_stats_to_osd(); + ldout(pg->cct, 10) << "Activate Finished" << dendl; +} + +boost::statechart::result PeeringState::Active::react(const AdvMap& advmap) +{ + PG *pg = context< PeeringMachine >().pg; + if (pg->should_restart_peering( + advmap.up_primary, + advmap.acting_primary, + advmap.newup, + advmap.newacting, + advmap.lastmap, + advmap.osdmap)) { + ldout(pg->cct, 10) << "Active advmap interval
change, fast return" << dendl; + return forward_event(); + } + ldout(pg->cct, 10) << "Active advmap" << dendl; + bool need_publish = false; + + if (advmap.osdmap->require_osd_release >= CEPH_RELEASE_MIMIC) { + const auto& new_removed_snaps = advmap.osdmap->get_new_removed_snaps(); + auto i = new_removed_snaps.find(pg->info.pgid.pool()); + if (i != new_removed_snaps.end()) { + bool bad = false; + for (auto j : i->second) { + if (pg->snap_trimq.intersects(j.first, j.second)) { + decltype(pg->snap_trimq) added, overlap; + added.insert(j.first, j.second); + overlap.intersection_of(pg->snap_trimq, added); + if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) { + lderr(pg->cct) << __func__ << " removed_snaps already contains " + << overlap << ", but this is the first mimic+ osdmap," + << " so it's expected" << dendl; + } else { + lderr(pg->cct) << __func__ << " removed_snaps already contains " + << overlap << dendl; + bad = true; + } + pg->snap_trimq.union_of(added); + } else { + pg->snap_trimq.insert(j.first, j.second); + } + } + if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) { + // at upgrade, we report *all* previously removed snaps as removed in + // the first mimic epoch. remove the ones we previously divined were + // removed (and subsequently purged) from the trimq. + lderr(pg->cct) << __func__ << " first mimic map, filtering purged_snaps" + << " from new removed_snaps" << dendl; + pg->snap_trimq.subtract(pg->info.purged_snaps); + } + ldout(pg->cct,10) << __func__ << " new removed_snaps " << i->second + << ", snap_trimq now " << pg->snap_trimq << dendl; + ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps); + pg->dirty_info = true; + pg->dirty_big_info = true; + } + + const auto& new_purged_snaps = advmap.osdmap->get_new_purged_snaps(); + auto j = new_purged_snaps.find(pg->info.pgid.pool()); + if (j != new_purged_snaps.end()) { + bool bad = false; + for (auto k : j->second) { + if (!pg->info.purged_snaps.contains(k.first, k.second)) { + decltype(pg->info.purged_snaps) rm, overlap; + rm.insert(k.first, k.second); + overlap.intersection_of(pg->info.purged_snaps, rm); + lderr(pg->cct) << __func__ << " purged_snaps does not contain " + << rm << ", only " << overlap << dendl; + pg->info.purged_snaps.subtract(overlap); + // This can currently happen in the normal (if unlikely) course of + // events. Because adding snaps to purged_snaps does not increase + // the pg version or add a pg log entry, we don't reliably propagate + // purged_snaps additions to other OSDs. + // One example: + // - purge S + // - primary and replicas update purged_snaps + // - no object updates + // - pg mapping changes, new primary on different node + // - new primary pg version == eversion_t(), so info is not + // propagated. + //bad = true; + } else { + pg->info.purged_snaps.erase(k.first, k.second); + } + } + ldout(pg->cct,10) << __func__ << " new purged_snaps " << j->second + << ", now " << pg->info.purged_snaps << dendl; + ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps); + pg->dirty_info = true; + pg->dirty_big_info = true; + } + if (pg->dirty_big_info) { + // share updated purged_snaps to mgr/mon so that we (a) stop reporting + // purged snaps and (b) perhaps share more snaps that we have purged + // but didn't fit in pg_stat_t. 
+ need_publish = true; + pg->share_pg_info(); + } + } else if (!pg->pool.newly_removed_snaps.empty()) { + pg->snap_trimq.union_of(pg->pool.newly_removed_snaps); + ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl; + pg->dirty_info = true; + pg->dirty_big_info = true; + } + + for (size_t i = 0; i < pg->want_acting.size(); i++) { + int osd = pg->want_acting[i]; + if (!advmap.osdmap->is_up(osd)) { + pg_shard_t osd_with_shard(osd, shard_id_t(i)); + ceph_assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard)); + } + } + + /* Check for changes in pool size (if the acting set changed as a result, + * this does not matter) */ + if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) != + pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) { + if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) { + pg->state_clear(PG_STATE_UNDERSIZED); + } else { + pg->state_set(PG_STATE_UNDERSIZED); + } + // degraded changes will be detected by call from publish_stats_to_osd() + need_publish = true; + } + + // if we haven't reported our PG stats in a long time, do so now. + if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) { + ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch) + << " epochs" << dendl; + need_publish = true; + } + + if (need_publish) + pg->publish_stats_to_osd(); + + return forward_event(); +} + +boost::statechart::result PeeringState::Active::react(const ActMap&) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "Active: handling ActMap" << dendl; + ceph_assert(pg->is_primary()); + + if (pg->have_unfound()) { + // object may have become unfound + pg->discover_all_missing(*context< PeeringMachine >().get_query_map()); + } + + if (pg->cct->_conf->osd_check_for_log_corruption) + pg->check_log_for_corruption(pg->osd->store); + + uint64_t unfound = pg->missing_loc.num_unfound(); + if (unfound > 0 && + pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) { + if (pg->cct->_conf->osd_auto_mark_unfound_lost) { + pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound + << " objects unfound and apparently lost, would automatically " + << "mark these objects lost but this feature is not yet implemented " + << "(osd_auto_mark_unfound_lost)"; + } else + pg->osd->clog->error() << pg->info.pgid.pgid << " has " + << unfound << " objects unfound and apparently lost"; + } + + if (pg->is_active()) { + ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl; + pg->kick_snap_trim(); + } + + if (pg->is_peered() && + !pg->is_clean() && + !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) && + (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) { + pg->queue_recovery(); + } + return forward_event(); +} + +boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt) +{ + PG *pg = context< PeeringMachine >().pg; + ceph_assert(pg->is_primary()); + if (pg->peer_info.count(notevt.from)) { + ldout(pg->cct, 10) << "Active: got notify from " << notevt.from + << ", already have info from that osd, ignoring" + << dendl; + } else if (pg->peer_purged.count(notevt.from)) { + ldout(pg->cct, 10) << "Active: got notify from " << notevt.from + << ", already purged that peer, ignoring" + << dendl; + } else { + ldout(pg->cct, 10) << "Active: got notify from " << notevt.from + << ", calling proc_replica_info and discover_all_missing" + << dendl; + pg->proc_replica_info( + 
notevt.from, notevt.notify.info, notevt.notify.epoch_sent); + if (pg->have_unfound() || (pg->is_degraded() && pg->might_have_unfound.count(notevt.from))) { + pg->discover_all_missing(*context< PeeringMachine >().get_query_map()); + } + } + return discard_event(); +} + +boost::statechart::result PeeringState::Active::react(const MTrim& trim) +{ + PG *pg = context< PeeringMachine >().pg; + ceph_assert(pg->is_primary()); + + // peer is informing us of their last_complete_ondisk + ldout(pg->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl; + pg->peer_last_complete_ondisk[pg_shard_t(trim.from, trim.shard)] = trim.trim_to; + + // trim log when the pg is recovered + pg->calc_min_last_complete_ondisk(); + return discard_event(); +} + +boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt) +{ + PG *pg = context< PeeringMachine >().pg; + ceph_assert(pg->is_primary()); + + ceph_assert(!pg->acting_recovery_backfill.empty()); + // don't update history (yet) if we are active and primary; the replica + // may be telling us they have activated (and committed) but we can't + // share that until _everyone_ does the same. + if (pg->is_acting_recovery_backfill(infoevt.from) && + pg->peer_activated.count(infoevt.from) == 0) { + ldout(pg->cct, 10) << " peer osd." << infoevt.from + << " activated and committed" << dendl; + pg->peer_activated.insert(infoevt.from); + pg->blocked_by.erase(infoevt.from.shard); + pg->publish_stats_to_osd(); + if (pg->peer_activated.size() == pg->acting_recovery_backfill.size()) { + pg->all_activated_and_committed(); + } + } + return discard_event(); +} + +boost::statechart::result PeeringState::Active::react(const MLogRec& logevt) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "searching osd." 
<< logevt.from
+                     << " log for unfound items" << dendl;
+  pg->proc_replica_log(
+    logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
+  bool got_missing = pg->search_for_missing(
+    pg->peer_info[logevt.from],
+    pg->peer_missing[logevt.from],
+    logevt.from,
+    context< PeeringMachine >().get_recovery_ctx());
+  // If there are missing AND we are "fully" active then start recovery now
+  if (got_missing && pg->state_test(PG_STATE_ACTIVE)) {
+    post_event(DoRecovery());
+  }
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const QueryState& q)
+{
+  PG *pg = context< PeeringMachine >().pg;
+
+  q.f->open_object_section("state");
+  q.f->dump_string("name", state_name);
+  q.f->dump_stream("enter_time") << enter_time;
+
+  {
+    q.f->open_array_section("might_have_unfound");
+    for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
+         p != pg->might_have_unfound.end();
+         ++p) {
+      q.f->open_object_section("osd");
+      q.f->dump_stream("osd") << *p;
+      if (pg->peer_missing.count(*p)) {
+        q.f->dump_string("status", "already probed");
+      } else if (pg->peer_missing_requested.count(*p)) {
+        q.f->dump_string("status", "querying");
+      } else if (!pg->get_osdmap()->is_up(p->osd)) {
+        q.f->dump_string("status", "osd is down");
+      } else {
+        q.f->dump_string("status", "not queried");
+      }
+      q.f->close_section();
+    }
+    q.f->close_section();
+  }
+  {
+    q.f->open_object_section("recovery_progress");
+    pg->dump_recovery_info(q.f);
+    q.f->close_section();
+  }
+
+  {
+    q.f->open_object_section("scrub");
+    q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
+    q.f->dump_bool("scrubber.active", pg->scrubber.active);
+    q.f->dump_string("scrubber.state", PG::Scrubber::state_string(pg->scrubber.state));
+    q.f->dump_stream("scrubber.start") << pg->scrubber.start;
+    q.f->dump_stream("scrubber.end") << pg->scrubber.end;
+    q.f->dump_stream("scrubber.max_end") << pg->scrubber.max_end;
+    q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
+    q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
+    {
+      q.f->open_array_section("scrubber.waiting_on_whom");
+      for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
+           p != pg->scrubber.waiting_on_whom.end();
+           ++p) {
+        q.f->dump_stream("shard") << *p;
+      }
+      q.f->close_section();
+    }
+    q.f->close_section();
+  }
+
+  q.f->close_section();
+  return forward_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  pg_t pgid = pg->info.pgid.pgid;
+
+  all_replicas_activated = true;
+
+  pg->state_clear(PG_STATE_ACTIVATING);
+  pg->state_clear(PG_STATE_CREATING);
+  pg->state_clear(PG_STATE_PREMERGE);
+
+  bool merge_target;
+  if (pg->pool.info.is_pending_merge(pgid, &merge_target)) {
+    pg->state_set(PG_STATE_PEERED);
+    pg->state_set(PG_STATE_PREMERGE);
+
+    if (pg->actingset.size() != pg->get_osdmap()->get_pg_size(pgid)) {
+      if (merge_target) {
+        pg_t src = pgid;
+        src.set_ps(pg->pool.info.get_pg_num_pending());
+        assert(src.get_parent() == pgid);
+        pg->osd->set_not_ready_to_merge_target(pgid, src);
+      } else {
+        pg->osd->set_not_ready_to_merge_source(pgid);
+      }
+    }
+  } else if (pg->acting.size() < pg->pool.info.min_size) {
+    pg->state_set(PG_STATE_PEERED);
+  } else {
+    pg->state_set(PG_STATE_ACTIVE);
+  }
+
+  if (pg->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
+    pg->osd->send_pg_created(pgid);
+  }
+
+  pg->info.history.last_epoch_started = pg->info.last_epoch_started;
+  pg->info.history.last_interval_started = pg->info.last_interval_started;
+  pg->dirty_info = true;
+
+  pg->share_pg_info();
+  pg->publish_stats_to_osd();
+
+  pg->check_local();
+
+  // waiters
+  if (pg->flushes_in_progress == 0) {
+    pg->requeue_ops(pg->waiting_for_peered);
+  } else if (!pg->waiting_for_peered.empty()) {
+    ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
+                       << pg->waiting_for_peered.size()
+                       << " items to waiting_for_flush"
+                       << dendl;
+    ceph_assert(pg->waiting_for_flush.empty());
+    pg->waiting_for_flush.swap(pg->waiting_for_peered);
+  }
+
+  pg->on_activate();
+
+  return discard_event();
+}
+
+void PeeringState::Active::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< PeeringMachine >().pg;
+  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+
+  pg->blocked_by.clear();
+  pg->backfill_reserved = false;
+  pg->backfill_reserving = false;
+  pg->state_clear(PG_STATE_ACTIVATING);
+  pg->state_clear(PG_STATE_DEGRADED);
+  pg->state_clear(PG_STATE_UNDERSIZED);
+  pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
+  pg->state_clear(PG_STATE_BACKFILL_WAIT);
+  pg->state_clear(PG_STATE_RECOVERY_WAIT);
+  pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
+  utime_t dur = ceph_clock_now() - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
+  pg->agent_stop();
+}
+
+/*------ReplicaActive-----*/
+PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().pg, "Started/ReplicaActive")
+{
+  context< PeeringMachine >().log_enter(state_name);
+
+  PG *pg = context< PeeringMachine >().pg;
+  pg->start_flush(context< PeeringMachine >().get_cur_transaction());
+}
+
+
+boost::statechart::result PeeringState::ReplicaActive::react(
+  const Activate& actevt) {
+  PG *pg = context< PeeringMachine >().pg;
+  ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
+  map<int, map<spg_t, pg_query_t> > query_map;
+  pg->activate(*context< PeeringMachine >().get_cur_transaction(),
+               actevt.activation_epoch,
+               query_map, NULL, NULL);
+  ldout(pg->cct, 10) << "Activate Finished" << dendl;
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  pg->proc_primary_info(*context< PeeringMachine >().get_cur_transaction(),
+                        infoevt.info);
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
+  ObjectStore::Transaction* t = context< PeeringMachine >().get_cur_transaction();
+  pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
+  ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
+
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  // primary is instructing us to trim
+  pg->pg_log.trim(trim.trim_to, pg->info);
+  pg->dirty_info = true;
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
+    context< PeeringMachine >().send_notify(
+      pg->get_primary(),
+      pg_notify_t(
+        pg->get_primary().shard, pg->pg_whoami.shard,
+        pg->get_osdmap_epoch(),
+        pg->get_osdmap_epoch(),
+        pg->info),
+      pg->past_intervals);
+  }
+  pg->take_waiters();
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(
+  const MQuery& query)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  pg->fulfill_query(query, context< PeeringMachine >().get_recovery_ctx());
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
+{
+  q.f->open_object_section("state");
+  q.f->dump_string("name", state_name);
+  q.f->dump_stream("enter_time") << enter_time;
+  q.f->close_section();
+  return forward_event();
+}
+
+void PeeringState::ReplicaActive::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< PeeringMachine >().pg;
+  pg->clear_reserved_num_bytes();
+  pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
+  utime_t dur = ceph_clock_now() - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
+}
+
+/*-------Stray---*/
+PeeringState::Stray::Stray(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().pg, "Started/Stray")
+{
+  context< PeeringMachine >().log_enter(state_name);
+
+  PG *pg = context< PeeringMachine >().pg;
+  ceph_assert(!pg->is_peered());
+  ceph_assert(!pg->is_peering());
+  ceph_assert(!pg->is_primary());
+
+  if (!pg->get_osdmap()->have_pg_pool(pg->get_pgid().pool())) {
+    ldout(pg->cct,10) << __func__ << " pool is deleted" << dendl;
+    post_event(DeleteStart());
+  } else {
+    pg->start_flush(context< PeeringMachine >().get_cur_transaction());
+  }
+}
+
+boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  MOSDPGLog *msg = logevt.msg.get();
+  ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
+
+  ObjectStore::Transaction* t = context< PeeringMachine >().get_cur_transaction();
+  if (msg->info.last_backfill == hobject_t()) {
+    // restart backfill
+    pg->unreg_next_scrub();
+    pg->info = msg->info;
+    pg->reg_next_scrub();
+    pg->dirty_info = true;
+    pg->dirty_big_info = true;  // maybe.
+
+    PG::PGLogEntryHandler rollbacker{pg, t};
+    pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
+
+    pg->pg_log.reset_backfill();
+  } else {
+    pg->merge_log(*t, msg->info, msg->log, logevt.from);
+  }
+
+  ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
+
+  post_event(Activate(logevt.msg->info.last_epoch_started));
+  return transit<ReplicaActive>();
+}
+
+boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
<< infoevt.from << " " << infoevt.info << dendl; + + if (pg->info.last_update > infoevt.info.last_update) { + // rewind divergent log entries + ObjectStore::Transaction* t = context().get_cur_transaction(); + pg->rewind_divergent_log(*t, infoevt.info.last_update); + pg->info.stats = infoevt.info.stats; + pg->info.hit_set = infoevt.info.hit_set; + } + + ceph_assert(infoevt.info.last_update == pg->info.last_update); + ceph_assert(pg->pg_log.get_head() == pg->info.last_update); + + post_event(Activate(infoevt.info.last_epoch_started)); + return transit(); +} + +boost::statechart::result PeeringState::Stray::react(const MQuery& query) +{ + PG *pg = context< PeeringMachine >().pg; + pg->fulfill_query(query, context().get_recovery_ctx()); + return discard_event(); +} + +boost::statechart::result PeeringState::Stray::react(const ActMap&) +{ + PG *pg = context< PeeringMachine >().pg; + if (pg->should_send_notify() && pg->get_primary().osd >= 0) { + context< PeeringMachine >().send_notify( + pg->get_primary(), + pg_notify_t( + pg->get_primary().shard, pg->pg_whoami.shard, + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + pg->info), + pg->past_intervals); + } + pg->take_waiters(); + return discard_event(); +} + +void PeeringState::Stray::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur); +} + + +/*--------ToDelete----------*/ +PeeringState::ToDelete::ToDelete(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/ToDelete") +{ + context< PeeringMachine >().log_enter(state_name); + PG *pg = context< PeeringMachine >().pg; + pg->osd->logger->inc(l_osd_pg_removing); +} + +void PeeringState::ToDelete::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + // note: on a successful removal, this path doesn't execute. see + // _delete_some(). 
+  pg->osd->logger->dec(l_osd_pg_removing);
+  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+}
+
+/*----WaitDeleteReserved----*/
+PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().pg,
+               "Started/ToDelete/WaitDeleteReseved")
+{
+  context< PeeringMachine >().log_enter(state_name);
+  PG *pg = context< PeeringMachine >().pg;
+  context< ToDelete >().priority = pg->get_delete_priority();
+  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+  pg->osd->local_reserver.request_reservation(
+    pg->info.pgid,
+    new PG::QueuePeeringEvt<DeleteReserved>(
+      pg, pg->get_osdmap_epoch(),
+      DeleteReserved()),
+    context< ToDelete >().priority,
+    new PG::QueuePeeringEvt<DeleteInterrupted>(
+      pg, pg->get_osdmap_epoch(),
+      DeleteInterrupted()));
+}
+
+boost::statechart::result PeeringState::ToDelete::react(
+  const ActMap& evt)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  if (pg->get_delete_priority() != priority) {
+    ldout(pg->cct,10) << __func__ << " delete priority changed, resetting"
+                      << dendl;
+    return transit<ToDelete>();
+  }
+  return discard_event();
+}
+
+void PeeringState::WaitDeleteReserved::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+}
+
+/*----Deleting-----*/
+PeeringState::Deleting::Deleting(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().pg, "Started/ToDelete/Deleting")
+{
+  context< PeeringMachine >().log_enter(state_name);
+  PG *pg = context< PeeringMachine >().pg;
+  pg->deleting = true;
+  ObjectStore::Transaction* t = context< PeeringMachine >().get_cur_transaction();
+  pg->on_removal(t);
+  t->register_on_commit(new PG::C_DeleteMore(pg, pg->get_osdmap_epoch()));
+}
+
+boost::statechart::result PeeringState::Deleting::react(
+  const DeleteSome& evt)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  pg->_delete_some(context< PeeringMachine >().get_cur_transaction());
+  return discard_event();
+}
+
+void PeeringState::Deleting::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< PeeringMachine >().pg;
+  pg->deleting = false;
+  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+}
+
+/*--------GetInfo---------*/
+PeeringState::GetInfo::GetInfo(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().pg, "Started/Primary/Peering/GetInfo")
+{
+  context< PeeringMachine >().log_enter(state_name);
+
+  PG *pg = context< PeeringMachine >().pg;
+  pg->check_past_interval_bounds();
+  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
+
+  ceph_assert(pg->blocked_by.empty());
+
+  prior_set = pg->build_prior();
+
+  pg->reset_min_peer_features();
+  get_infos();
+  if (prior_set.pg_down) {
+    post_event(IsDown());
+  } else if (peer_info_requested.empty()) {
+    post_event(GotInfo());
+  }
+}
+
+void PeeringState::GetInfo::get_infos()
+{
+  PG *pg = context< PeeringMachine >().pg;
+  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
+
+  pg->blocked_by.clear();
+  for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
+       it != prior_set.probe.end();
+       ++it) {
+    pg_shard_t peer = *it;
+    if (peer == pg->pg_whoami) {
+      continue;
+    }
+    if (pg->peer_info.count(peer)) {
+      ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
+      continue;
+    }
+    if (peer_info_requested.count(peer)) {
+      ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
+      pg->blocked_by.insert(peer.osd);
+    } else if (!pg->get_osdmap()->is_up(peer.osd)) {
+      ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
+    } else {
+      ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
+      context< PeeringMachine >().send_query(
+        peer, pg_query_t(pg_query_t::INFO,
+                         it->shard, pg->pg_whoami.shard,
+                         pg->info.history,
+                         pg->get_osdmap_epoch()));
+      peer_info_requested.insert(peer);
+      pg->blocked_by.insert(peer.osd);
+    }
+  }
+
+  pg->publish_stats_to_osd();
+}
+
+boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
+{
+  PG *pg = context< PeeringMachine >().pg;
+
+  set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
+  if (p != peer_info_requested.end()) {
+    peer_info_requested.erase(p);
+    pg->blocked_by.erase(infoevt.from.osd);
+  }
+
+  epoch_t old_start = pg->info.history.last_epoch_started;
+  if (pg->proc_replica_info(
+        infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
+    // we got something new ...
+    PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
+    if (old_start < pg->info.history.last_epoch_started) {
+      ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
+      prior_set = pg->build_prior();
+
+      // filter out any osds that got dropped from the probe set from
+      // peer_info_requested.  this is less expensive than restarting
+      // peering (which would re-probe everyone).
+      set<pg_shard_t>::iterator p = peer_info_requested.begin();
+      while (p != peer_info_requested.end()) {
+        if (prior_set.probe.count(*p) == 0) {
+          ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
+          peer_info_requested.erase(p++);
+        } else {
+          ++p;
+        }
+      }
+      get_infos();
+    }
+    ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
+                       << hex << infoevt.features << dec << dendl;
+    pg->apply_peer_features(infoevt.features);
+
+    // are we done getting everything?
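+    // (note: pg_down may have been set when the prior set was rebuilt
+    // above, in which case we keep waiting instead of posting GotInfo)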
+    if (peer_info_requested.empty() && !prior_set.pg_down) {
+      ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
+      ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
+      ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
+      post_event(GotInfo());
+    }
+  }
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  q.f->open_object_section("state");
+  q.f->dump_string("name", state_name);
+  q.f->dump_stream("enter_time") << enter_time;
+
+  q.f->open_array_section("requested_info_from");
+  for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
+       p != peer_info_requested.end();
+       ++p) {
+    q.f->open_object_section("osd");
+    q.f->dump_stream("osd") << *p;
+    if (pg->peer_info.count(*p)) {
+      q.f->open_object_section("got_info");
+      pg->peer_info[*p].dump(q.f);
+      q.f->close_section();
+    }
+    q.f->close_section();
+  }
+  q.f->close_section();
+
+  q.f->close_section();
+  return forward_event();
+}
+
+void PeeringState::GetInfo::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< PeeringMachine >().pg;
+  utime_t dur = ceph_clock_now() - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
+  pg->blocked_by.clear();
+}
+
+/*------GetLog------------*/
+PeeringState::GetLog::GetLog(my_context ctx)
+  : my_base(ctx),
+    NamedState(
+      context< PeeringMachine >().pg, "Started/Primary/Peering/GetLog"),
+    msg(0)
+{
+  context< PeeringMachine >().log_enter(state_name);
+
+  PG *pg = context< PeeringMachine >().pg;
+
+  // adjust acting?
+  if (!pg->choose_acting(auth_log_shard, false,
+                         &context< Peering >().history_les_bound)) {
+    if (!pg->want_acting.empty()) {
+      post_event(NeedActingChange());
+    } else {
+      post_event(IsIncomplete());
+    }
+    return;
+  }
+
+  // am i the best?
+  if (auth_log_shard == pg->pg_whoami) {
+    post_event(GotLog());
+    return;
+  }
+
+  const pg_info_t& best = pg->peer_info[auth_log_shard];
+
+  // am i broken?
+  if (pg->info.last_update < best.log_tail) {
+    ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
+    post_event(IsIncomplete());
+    return;
+  }
+
+  // how much log to request?
+  eversion_t request_log_from = pg->info.last_update;
+  ceph_assert(!pg->acting_recovery_backfill.empty());
+  for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin();
+       p != pg->acting_recovery_backfill.end();
+       ++p) {
+    if (*p == pg->pg_whoami) continue;
+    pg_info_t& ri = pg->peer_info[*p];
+    if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
+        ri.last_update < request_log_from)
+      request_log_from = ri.last_update;
+  }
+
+  // how much?
+  ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
+  context< PeeringMachine >().send_query(
+    auth_log_shard,
+    pg_query_t(
+      pg_query_t::LOG,
+      auth_log_shard.shard, pg->pg_whoami.shard,
+      request_log_from, pg->info.history,
+      pg->get_osdmap_epoch()));
+
+  ceph_assert(pg->blocked_by.empty());
+  pg->blocked_by.insert(auth_log_shard.osd);
+  pg->publish_stats_to_osd();
+}
+
+boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  // make sure our log source didn't go down.  we need to check
+  // explicitly because it may not be part of the prior set, which
+  // means the Peering state check won't catch it going down.
+  if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
+    ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
+                       << auth_log_shard.osd << " went down" << dendl;
+    post_event(advmap);
+    return transit< Reset >();
+  }
+
+  // let the Peering state do its checks.
+  return forward_event();
+}
+
+boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  ceph_assert(!msg);
+  if (logevt.from != auth_log_shard) {
+    ldout(pg->cct, 10) << "GetLog: discarding log from "
+                       << "non-auth_log_shard osd." << logevt.from << dendl;
+    return discard_event();
+  }
+  ldout(pg->cct, 10) << "GetLog: received master log from osd."
+                     << logevt.from << dendl;
+  msg = logevt.msg;
+  post_event(GotLog());
+  return discard_event();
+}
+
+boost::statechart::result PeeringState::GetLog::react(const GotLog&)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  ldout(pg->cct, 10) << "leaving GetLog" << dendl;
+  if (msg) {
+    ldout(pg->cct, 10) << "processing master log" << dendl;
+    pg->proc_master_log(*context< PeeringMachine >().get_cur_transaction(),
+                        msg->info, msg->log, msg->missing,
+                        auth_log_shard);
+  }
+  pg->start_flush(context< PeeringMachine >().get_cur_transaction());
+  return transit< GetMissing >();
+}
+
+boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
+{
+  q.f->open_object_section("state");
+  q.f->dump_string("name", state_name);
+  q.f->dump_stream("enter_time") << enter_time;
+  q.f->dump_stream("auth_log_shard") << auth_log_shard;
+  q.f->close_section();
+  return forward_event();
+}
+
+void PeeringState::GetLog::exit()
+{
+  context< PeeringMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< PeeringMachine >().pg;
+  utime_t dur = ceph_clock_now() - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
+  pg->blocked_by.clear();
+}
+
+/*------WaitActingChange--------*/
+PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< PeeringMachine >().pg, "Started/Primary/WaitActingChange")
+{
+  context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
+{
+  PG *pg = context< PeeringMachine >().pg;
+  OSDMapRef osdmap = advmap.osdmap;
+
+  ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets didn't go down" << dendl;
+  for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
+    if (!osdmap->is_up(*p)) {
+      ldout(pg->cct, 10) << " want_acting target osd." 
<< *p << " went down, resetting" << dendl; + post_event(advmap); + return transit< Reset >(); + } + } + return forward_event(); +} + +boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLocRec" << dendl; + return discard_event(); +} + +boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl; + return discard_event(); +} + +boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl; + return discard_event(); +} + +boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->dump_string("comment", "waiting for pg acting set to change"); + q.f->close_section(); + return forward_event(); +} + +void PeeringState::WaitActingChange::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur); +} + +/*------Down--------*/ +PeeringState::Down::Down(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Peering/Down") +{ + context< PeeringMachine >().log_enter(state_name); + PG *pg = context< PeeringMachine >().pg; + + pg->state_clear(PG_STATE_PEERING); + pg->state_set(PG_STATE_DOWN); + + auto &prior_set = context< Peering >().prior_set; + ceph_assert(pg->blocked_by.empty()); + pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end()); + pg->publish_stats_to_osd(); +} + +void PeeringState::Down::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + + pg->state_clear(PG_STATE_DOWN); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_down_latency, dur); + + pg->blocked_by.clear(); +} + +boost::statechart::result PeeringState::Down::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->dump_string("comment", + "not enough up instances of this PG to go active"); + q.f->close_section(); + return forward_event(); +} + +boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt) +{ + PG *pg = context< PeeringMachine >().pg; + + ceph_assert(pg->is_primary()); + epoch_t old_start = pg->info.history.last_epoch_started; + if (!pg->peer_info.count(infoevt.from) && + pg->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) { + pg->update_history(infoevt.notify.info.history); + } + // if we got something new to make pg escape down state + if (pg->info.history.last_epoch_started > old_start) { + ldout(pg->cct, 10) << " last_epoch_started moved forward, re-enter getinfo" << dendl; + pg->state_clear(PG_STATE_DOWN); + pg->state_set(PG_STATE_PEERING); + return transit< GetInfo >(); + } + + return discard_event(); +} + + +/*------Incomplete--------*/ +PeeringState::Incomplete::Incomplete(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, 
"Started/Primary/Peering/Incomplete") +{ + context< PeeringMachine >().log_enter(state_name); + PG *pg = context< PeeringMachine >().pg; + + pg->state_clear(PG_STATE_PEERING); + pg->state_set(PG_STATE_INCOMPLETE); + + PastIntervals::PriorSet &prior_set = context< Peering >().prior_set; + ceph_assert(pg->blocked_by.empty()); + pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end()); + pg->publish_stats_to_osd(); +} + +boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) { + PG *pg = context< PeeringMachine >().pg; + int64_t poolnum = pg->info.pgid.pool(); + + // Reset if min_size turn smaller than previous value, pg might now be able to go active + if (!advmap.osdmap->have_pg_pool(poolnum) || + advmap.lastmap->get_pools().find(poolnum)->second.min_size > + advmap.osdmap->get_pools().find(poolnum)->second.min_size) { + post_event(advmap); + return transit< Reset >(); + } + + return forward_event(); +} + +boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) { + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl; + if (pg->proc_replica_info( + notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) { + // We got something new, try again! + return transit< GetLog >(); + } else { + return discard_event(); + } +} + +boost::statechart::result PeeringState::Incomplete::react( + const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->dump_string("comment", "not enough complete instances of this PG"); + q.f->close_section(); + return forward_event(); +} + +void PeeringState::Incomplete::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + + pg->state_clear(PG_STATE_INCOMPLETE); + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur); + + pg->blocked_by.clear(); +} + +/*------GetMissing--------*/ +PeeringState::GetMissing::GetMissing(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Peering/GetMissing") +{ + context< PeeringMachine >().log_enter(state_name); + + PG *pg = context< PeeringMachine >().pg; + ceph_assert(!pg->acting_recovery_backfill.empty()); + eversion_t since; + for (set::iterator i = pg->acting_recovery_backfill.begin(); + i != pg->acting_recovery_backfill.end(); + ++i) { + if (*i == pg->get_primary()) continue; + const pg_info_t& pi = pg->peer_info[*i]; + // reset this so to make sure the pg_missing_t is initialized and + // has the correct semantics even if we don't need to get a + // missing set from a shard. This way later additions due to + // lost+unfound delete work properly. + pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering(); + + if (pi.is_empty()) + continue; // no pg data, nothing divergent + + if (pi.last_update < pg->pg_log.get_tail()) { + ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl; + pg->peer_missing[*i].clear(); + continue; + } + if (pi.last_backfill == hobject_t()) { + ldout(pg->cct, 10) << " osd." 
<< *i << " will fully backfill; can infer empty missing set" << dendl; + pg->peer_missing[*i].clear(); + continue; + } + + if (pi.last_update == pi.last_complete && // peer has no missing + pi.last_update == pg->info.last_update) { // peer is up to date + // replica has no missing and identical log as us. no need to + // pull anything. + // FIXME: we can do better here. if last_update==last_complete we + // can infer the rest! + ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl; + pg->peer_missing[*i].clear(); + continue; + } + + // We pull the log from the peer's last_epoch_started to ensure we + // get enough log to detect divergent updates. + since.epoch = pi.last_epoch_started; + ceph_assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing + if (pi.log_tail <= since) { + ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl; + context< PeeringMachine >().send_query( + *i, + pg_query_t( + pg_query_t::LOG, + i->shard, pg->pg_whoami.shard, + since, pg->info.history, + pg->get_osdmap_epoch())); + } else { + ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i + << " (want since " << since << " < log.tail " + << pi.log_tail << ")" << dendl; + context< PeeringMachine >().send_query( + *i, pg_query_t( + pg_query_t::FULLLOG, + i->shard, pg->pg_whoami.shard, + pg->info.history, pg->get_osdmap_epoch())); + } + peer_missing_requested.insert(*i); + pg->blocked_by.insert(i->osd); + } + + if (peer_missing_requested.empty()) { + if (pg->need_up_thru) { + ldout(pg->cct, 10) << " still need up_thru update before going active" + << dendl; + post_event(NeedUpThru()); + return; + } + + // all good! + post_event(Activate(pg->get_osdmap_epoch())); + } else { + pg->publish_stats_to_osd(); + } +} + +boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt) +{ + PG *pg = context< PeeringMachine >().pg; + + peer_missing_requested.erase(logevt.from); + pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from); + + if (peer_missing_requested.empty()) { + if (pg->need_up_thru) { + ldout(pg->cct, 10) << " still need up_thru update before going active" + << dendl; + post_event(NeedUpThru()); + } else { + ldout(pg->cct, 10) << "Got last missing, don't need missing " + << "posting Activate" << dendl; + post_event(Activate(pg->get_osdmap_epoch())); + } + } + return discard_event(); +} + +boost::statechart::result PeeringState::GetMissing::react(const QueryState& q) +{ + PG *pg = context< PeeringMachine >().pg; + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + + q.f->open_array_section("peer_missing_requested"); + for (set::iterator p = peer_missing_requested.begin(); + p != peer_missing_requested.end(); + ++p) { + q.f->open_object_section("osd"); + q.f->dump_stream("osd") << *p; + if (pg->peer_missing.count(*p)) { + q.f->open_object_section("got_missing"); + pg->peer_missing[*p].dump(q.f); + q.f->close_section(); + } + q.f->close_section(); + } + q.f->close_section(); + + q.f->close_section(); + return forward_event(); +} + +void PeeringState::GetMissing::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur); + pg->blocked_by.clear(); +} + +/*------WaitUpThru--------*/ 
+PeeringState::WaitUpThru::WaitUpThru(my_context ctx) + : my_base(ctx), + NamedState(context< PeeringMachine >().pg, "Started/Primary/Peering/WaitUpThru") +{ + context< PeeringMachine >().log_enter(state_name); +} + +boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am) +{ + PG *pg = context< PeeringMachine >().pg; + if (!pg->need_up_thru) { + post_event(Activate(pg->get_osdmap_epoch())); + } + return forward_event(); +} + +boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl; + pg->peer_missing[logevt.from].claim(logevt.msg->missing); + pg->peer_info[logevt.from] = logevt.msg->info; + return discard_event(); +} + +boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q) +{ + q.f->open_object_section("state"); + q.f->dump_string("name", state_name); + q.f->dump_stream("enter_time") << enter_time; + q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd"); + q.f->close_section(); + return forward_event(); +} + +void PeeringState::WaitUpThru::exit() +{ + context< PeeringMachine >().log_exit(state_name, enter_time); + PG *pg = context< PeeringMachine >().pg; + utime_t dur = ceph_clock_now() - enter_time; + pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur); +} + +/*----PeeringState::PeeringMachine Methods-----*/ +#undef dout_prefix +#define dout_prefix pg->gen_prefix(*_dout) + +void PeeringState::PeeringMachine::log_enter(const char *state_name) +{ + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 5) << "enter " << state_name << dendl; + pg->osd->pg_recovery_stats.log_enter(state_name); +} + +void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time) +{ + utime_t dur = ceph_clock_now() - enter_time; + PG *pg = context< PeeringMachine >().pg; + ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl; + pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time, + event_count, event_time); + event_count = 0; + event_time = utime_t(); +} + + +/*---------------------------------------------------*/ +#undef dout_prefix +#define dout_prefix ((debug_pg ? 
debug_pg->gen_prefix(*_dout) : *_dout) << " PriorSet: ")
+
+void PeeringState::start_handle(PeeringCtx *new_ctx) {
+  ceph_assert(!rctx);
+  ceph_assert(!orig_ctx);
+  orig_ctx = new_ctx;
+  if (new_ctx) {
+    if (messages_pending_flush) {
+      rctx = PeeringCtx(*messages_pending_flush, *new_ctx);
+    } else {
+      rctx = *new_ctx;
+    }
+    rctx->start_time = ceph_clock_now();
+  }
+}
+
+void PeeringState::begin_block_outgoing() {
+  ceph_assert(!messages_pending_flush);
+  ceph_assert(orig_ctx);
+  ceph_assert(rctx);
+  messages_pending_flush = BufferedRecoveryMessages();
+  rctx = PeeringCtx(*messages_pending_flush, *orig_ctx);
+}
+
+void PeeringState::clear_blocked_outgoing() {
+  ceph_assert(orig_ctx);
+  ceph_assert(rctx);
+  messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
+}
+
+void PeeringState::end_block_outgoing() {
+  ceph_assert(messages_pending_flush);
+  ceph_assert(orig_ctx);
+  ceph_assert(rctx);
+
+  rctx = PeeringCtx(*orig_ctx);
+  rctx->accept_buffered_messages(*messages_pending_flush);
+  messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
+}
+
+void PeeringState::end_handle() {
+  if (rctx) {
+    utime_t dur = ceph_clock_now() - rctx->start_time;
+    machine.event_time += dur;
+  }
+
+  machine.event_count++;
+  rctx = boost::optional<PeeringCtx>();
+  orig_ctx = NULL;
+}
diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h
new file mode 100644
index 00000000000..ead1967fba5
--- /dev/null
+++ b/src/osd/PeeringState.h
@@ -0,0 +1,992 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/statechart/custom_reaction.hpp>
+#include <boost/statechart/event.hpp>
+#include <boost/statechart/simple_state.hpp>
+#include <boost/statechart/state.hpp>
+#include <boost/statechart/state_machine.hpp>
+#include <boost/statechart/transition.hpp>
+#include <boost/statechart/event_base.hpp>
+
+#include "PGPeeringEvent.h"
+#include "os/ObjectStore.h"
+#include "OSDMap.h"
+
+class PG;
+
+  /* Encapsulates PG recovery process */
+class PeeringState {
+public:
+  struct NamedState {
+    const char *state_name;
+    utime_t enter_time;
+    PG* pg;
+    const char *get_state_name() { return state_name; }
+    NamedState(PG *pg_, const char *state_name_);
+    virtual ~NamedState();
+  };
+
+  // [primary only] content recovery state
+  struct BufferedRecoveryMessages {
+    map<int, map<spg_t, pg_query_t> > query_map;
+    map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
+    map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
+  };
+
+  struct PeeringCtx {
+    utime_t start_time;
+    map<int, map<spg_t, pg_query_t> > *query_map;
+    map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
+    map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
+    ObjectStore::Transaction *transaction;
+    ThreadPool::TPHandle* handle;
+    PeeringCtx(map<int, map<spg_t, pg_query_t> > *query_map,
+               map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map,
+               map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
+               ObjectStore::Transaction *transaction)
+      : query_map(query_map), info_map(info_map),
+        notify_list(notify_list),
+        transaction(transaction),
+        handle(NULL) {}
+
+    PeeringCtx(BufferedRecoveryMessages &buf, PeeringCtx &rctx)
+      : query_map(&(buf.query_map)),
+        info_map(&(buf.info_map)),
+        notify_list(&(buf.notify_list)),
+        transaction(rctx.transaction),
+        handle(rctx.handle) {}
+
+    void accept_buffered_messages(BufferedRecoveryMessages &m) {
+      ceph_assert(query_map);
+      ceph_assert(info_map);
+      ceph_assert(notify_list);
+      for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
+           i != m.query_map.end();
+           ++i) {
+        map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
+        for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
+             j != i->second.end();
+             ++j) {
+          omap[j->first] = j->second;
+        }
+      }
+      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
+             = m.info_map.begin();
+           i != m.info_map.end();
+           ++i) {
+        vector<pair<pg_notify_t, PastIntervals> > &ovec =
+          (*info_map)[i->first];
+        ovec.reserve(ovec.size() + i->second.size());
+        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
+      }
+      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
+             = m.notify_list.begin();
+           i != m.notify_list.end();
+           ++i) {
+        vector<pair<pg_notify_t, PastIntervals> > &ovec =
+          (*notify_list)[i->first];
+        ovec.reserve(ovec.size() + i->second.size());
+        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
+      }
+    }
+
+    void send_notify(pg_shard_t to,
+                     const pg_notify_t &info, const PastIntervals &pi) {
+      ceph_assert(notify_list);
+      (*notify_list)[to.osd].emplace_back(info, pi);
+    }
+  };
+
+  struct QueryState : boost::statechart::event< QueryState > {
+    Formatter *f;
+    explicit QueryState(Formatter *f) : f(f) {}
+    void print(std::ostream *out) const {
+      *out << "Query";
+    }
+  };
+
+  struct AdvMap : boost::statechart::event< AdvMap > {
+    OSDMapRef osdmap;
+    OSDMapRef lastmap;
+    vector<int> newup, newacting;
+    int up_primary, acting_primary;
+    AdvMap(
+      OSDMapRef osdmap, OSDMapRef lastmap,
+      vector<int>& newup, int up_primary,
+      vector<int>& newacting, int acting_primary):
+      osdmap(osdmap), lastmap(lastmap),
+      newup(newup),
+      newacting(newacting),
+      up_primary(up_primary),
+      acting_primary(acting_primary) {}
+    void print(std::ostream *out) const {
+      *out << "AdvMap";
+    }
+  };
+
+  struct ActMap : boost::statechart::event< ActMap > {
+    ActMap() : boost::statechart::event< ActMap >() {}
+    void print(std::ostream *out) const {
+      *out << "ActMap";
+    }
+  };
+  struct Activate : boost::statechart::event< Activate > {
+    epoch_t activation_epoch;
+    explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
+                                   activation_epoch(q) {}
+    void print(std::ostream *out) const {
+      *out << "Activate from " << activation_epoch;
+    }
+  };
+public:
+  struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
+    explicit UnfoundBackfill() {}
+    void print(std::ostream *out) const {
+      *out << "UnfoundBackfill";
+    }
+  };
+  struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
+    explicit UnfoundRecovery() {}
+    void print(std::ostream *out) const {
+      *out << "UnfoundRecovery";
+    }
+  };
+
+  struct RequestScrub : boost::statechart::event<RequestScrub> {
+    bool deep;
+    bool repair;
+    explicit RequestScrub(bool d, bool r) : deep(d), repair(r) {}
+    void print(std::ostream *out) const {
+      *out << "RequestScrub(" << (deep ? "deep" : "shallow")
+           << (repair ? 
" repair" : ""); + } + }; + + TrivialEvent(Initialize) + TrivialEvent(GotInfo) + TrivialEvent(NeedUpThru) + TrivialEvent(Backfilled) + TrivialEvent(LocalBackfillReserved) + TrivialEvent(RejectRemoteReservation) + TrivialEvent(RequestBackfill) + TrivialEvent(RemoteRecoveryPreempted) + TrivialEvent(RemoteBackfillPreempted) + TrivialEvent(BackfillTooFull) + TrivialEvent(RecoveryTooFull) + + TrivialEvent(MakePrimary) + TrivialEvent(MakeStray) + TrivialEvent(NeedActingChange) + TrivialEvent(IsIncomplete) + TrivialEvent(IsDown) + + TrivialEvent(AllReplicasRecovered) + TrivialEvent(DoRecovery) + TrivialEvent(LocalRecoveryReserved) + TrivialEvent(AllRemotesReserved) + TrivialEvent(AllBackfillsReserved) + TrivialEvent(GoClean) + + TrivialEvent(AllReplicasActivated) + + TrivialEvent(IntervalFlush) + + TrivialEvent(DeleteStart) + TrivialEvent(DeleteSome) + + TrivialEvent(SetForceRecovery) + TrivialEvent(UnsetForceRecovery) + TrivialEvent(SetForceBackfill) + TrivialEvent(UnsetForceBackfill) + + TrivialEvent(DeleteReserved) + TrivialEvent(DeleteInterrupted) + + void start_handle(PeeringCtx *new_ctx); + void end_handle(); + void begin_block_outgoing(); + void end_block_outgoing(); + void clear_blocked_outgoing(); + private: + + /* States */ + struct Initial; + class PeeringMachine : public boost::statechart::state_machine< PeeringMachine, Initial > { + PeeringState *state; + public: + PG *pg; + + utime_t event_time; + uint64_t event_count; + + void clear_event_counters() { + event_time = utime_t(); + event_count = 0; + } + + void log_enter(const char *state_name); + void log_exit(const char *state_name, utime_t duration); + + PeeringMachine(PeeringState *state, PG *pg) : state(state), pg(pg), event_count(0) {} + + /* Accessor functions for state methods */ + ObjectStore::Transaction* get_cur_transaction() { + ceph_assert(state->rctx); + ceph_assert(state->rctx->transaction); + return state->rctx->transaction; + } + + void send_query(pg_shard_t to, const pg_query_t &query); + + map > *get_query_map() { + ceph_assert(state->rctx); + ceph_assert(state->rctx->query_map); + return state->rctx->query_map; + } + + map > > *get_info_map() { + ceph_assert(state->rctx); + ceph_assert(state->rctx->info_map); + return state->rctx->info_map; + } + + PeeringCtx *get_recovery_ctx() { return &*(state->rctx); } + + void send_notify(pg_shard_t to, + const pg_notify_t &info, const PastIntervals &pi) { + ceph_assert(state->rctx); + state->rctx->send_notify(to, info, pi); + } + }; + friend class PeeringMachine; + + /* States */ + // Initial + // Reset + // Start + // Started + // Primary + // WaitActingChange + // Peering + // GetInfo + // GetLog + // GetMissing + // WaitUpThru + // Incomplete + // Active + // Activating + // Clean + // Recovered + // Backfilling + // WaitRemoteBackfillReserved + // WaitLocalBackfillReserved + // NotBackfilling + // NotRecovering + // Recovering + // WaitRemoteRecoveryReserved + // WaitLocalRecoveryReserved + // ReplicaActive + // RepNotRecovering + // RepRecovering + // RepWaitBackfillReserved + // RepWaitRecoveryReserved + // Stray + // ToDelete + // WaitDeleteReserved + // Deleting + // Crashed + + struct Crashed : boost::statechart::state< Crashed, PeeringMachine >, NamedState { + explicit Crashed(my_context ctx); + }; + + struct Reset; + + struct Initial : boost::statechart::state< Initial, PeeringMachine >, NamedState { + explicit Initial(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::transition< Initialize, Reset >, + 
+      boost::statechart::custom_reaction< NullEvt >,
+      boost::statechart::transition< boost::statechart::event_base, Crashed >
+      > reactions;
+
+    boost::statechart::result react(const MNotifyRec&);
+    boost::statechart::result react(const MInfoRec&);
+    boost::statechart::result react(const MLogRec&);
+    boost::statechart::result react(const boost::statechart::event_base&) {
+      return discard_event();
+    }
+  };
+
+  struct Reset : boost::statechart::state< Reset, PeeringMachine >, NamedState {
+    explicit Reset(my_context ctx);
+    void exit();
+
+    typedef boost::mpl::list <
+      boost::statechart::custom_reaction< QueryState >,
+      boost::statechart::custom_reaction< AdvMap >,
+      boost::statechart::custom_reaction< ActMap >,
+      boost::statechart::custom_reaction< NullEvt >,
+      boost::statechart::custom_reaction< IntervalFlush >,
+      boost::statechart::transition< boost::statechart::event_base, Crashed >
+      > reactions;
+    boost::statechart::result react(const QueryState& q);
+    boost::statechart::result react(const AdvMap&);
+    boost::statechart::result react(const ActMap&);
+    boost::statechart::result react(const IntervalFlush&);
+    boost::statechart::result react(const boost::statechart::event_base&) {
+      return discard_event();
+    }
+  };
+
+  struct Start;
+
+  struct Started : boost::statechart::state< Started, PeeringMachine, Start >, NamedState {
+    explicit Started(my_context ctx);
+    void exit();
+
+    typedef boost::mpl::list <
+      boost::statechart::custom_reaction< QueryState >,
+      boost::statechart::custom_reaction< AdvMap >,
+      boost::statechart::custom_reaction< IntervalFlush >,
+      // ignored
+      boost::statechart::custom_reaction< NullEvt >,
+      boost::statechart::custom_reaction<SetForceRecovery>,
+      boost::statechart::custom_reaction<UnsetForceRecovery>,
+      boost::statechart::custom_reaction<SetForceBackfill>,
+      boost::statechart::custom_reaction<UnsetForceBackfill>,
+      boost::statechart::custom_reaction<RequestScrub>,
+      // crash
+      boost::statechart::transition< boost::statechart::event_base, Crashed >
+      > reactions;
+    boost::statechart::result react(const QueryState& q);
+    boost::statechart::result react(const AdvMap&);
+    boost::statechart::result react(const IntervalFlush&);
+    boost::statechart::result react(const boost::statechart::event_base&) {
+      return discard_event();
+    }
+  };
+
+  struct Primary;
+  struct Stray;
+
+  struct Start : boost::statechart::state< Start, Started >, NamedState {
+    explicit Start(my_context ctx);
+    void exit();
+
+    typedef boost::mpl::list <
+      boost::statechart::transition< MakePrimary, Primary >,
+      boost::statechart::transition< MakeStray, Stray >
+      > reactions;
+  };
+
+  struct Peering;
+  struct WaitActingChange;
+  struct Incomplete;
+  struct Down;
+
+  struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
+    explicit Primary(my_context ctx);
+    void exit();
+
+    typedef boost::mpl::list <
+      boost::statechart::custom_reaction< ActMap >,
+      boost::statechart::custom_reaction< MNotifyRec >,
+      boost::statechart::transition< NeedActingChange, WaitActingChange >,
+      boost::statechart::custom_reaction<SetForceRecovery>,
+      boost::statechart::custom_reaction<UnsetForceRecovery>,
+      boost::statechart::custom_reaction<SetForceBackfill>,
+      boost::statechart::custom_reaction<UnsetForceBackfill>,
+      boost::statechart::custom_reaction<RequestScrub>
+      > reactions;
+    boost::statechart::result react(const ActMap&);
+    boost::statechart::result react(const MNotifyRec&);
+    boost::statechart::result react(const SetForceRecovery&);
+    boost::statechart::result react(const UnsetForceRecovery&);
+    boost::statechart::result react(const SetForceBackfill&);
+    boost::statechart::result react(const UnsetForceBackfill&);
+    boost::statechart::result react(const 
RequestScrub&); + }; + + struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>, + NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< MInfoRec >, + boost::statechart::custom_reaction< MNotifyRec > + > reactions; + explicit WaitActingChange(my_context ctx); + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const AdvMap&); + boost::statechart::result react(const MLogRec&); + boost::statechart::result react(const MInfoRec&); + boost::statechart::result react(const MNotifyRec&); + void exit(); + }; + + struct GetInfo; + struct Active; + + struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState { + PastIntervals::PriorSet prior_set; + bool history_les_bound; //< need osd_find_best_info_ignore_history_les + + explicit Peering(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::transition< Activate, Active >, + boost::statechart::custom_reaction< AdvMap > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const AdvMap &advmap); + }; + + struct WaitLocalRecoveryReserved; + struct Activating; + struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState { + explicit Active(my_context ctx); + void exit(); + + const set remote_shards_to_reserve_recovery; + const set remote_shards_to_reserve_backfill; + bool all_replicas_activated; + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::custom_reaction< MInfoRec >, + boost::statechart::custom_reaction< MNotifyRec >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< MTrim >, + boost::statechart::custom_reaction< Backfilled >, + boost::statechart::custom_reaction< AllReplicasActivated >, + boost::statechart::custom_reaction< DeferRecovery >, + boost::statechart::custom_reaction< DeferBackfill >, + boost::statechart::custom_reaction< UnfoundRecovery >, + boost::statechart::custom_reaction< UnfoundBackfill >, + boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>, + boost::statechart::custom_reaction< RemoteReservationRevoked>, + boost::statechart::custom_reaction< DoRecovery> + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const ActMap&); + boost::statechart::result react(const AdvMap&); + boost::statechart::result react(const MInfoRec& infoevt); + boost::statechart::result react(const MNotifyRec& notevt); + boost::statechart::result react(const MLogRec& logevt); + boost::statechart::result react(const MTrim& trimevt); + boost::statechart::result react(const Backfilled&) { + return discard_event(); + } + boost::statechart::result react(const AllReplicasActivated&); + boost::statechart::result react(const DeferRecovery& evt) { + return discard_event(); + } + boost::statechart::result react(const DeferBackfill& evt) { + return discard_event(); + } + boost::statechart::result react(const UnfoundRecovery& evt) { + return discard_event(); + } + boost::statechart::result react(const UnfoundBackfill& evt) { + return discard_event(); + } + boost::statechart::result react(const 
RemoteReservationRevokedTooFull&) { + return discard_event(); + } + boost::statechart::result react(const RemoteReservationRevoked&) { + return discard_event(); + } + boost::statechart::result react(const DoRecovery&) { + return discard_event(); + } + }; + + struct Clean : boost::statechart::state< Clean, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, + boost::statechart::custom_reaction, + boost::statechart::custom_reaction + > reactions; + explicit Clean(my_context ctx); + void exit(); + boost::statechart::result react(const boost::statechart::event_base&) { + return discard_event(); + } + }; + + struct Recovered : boost::statechart::state< Recovered, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< GoClean, Clean >, + boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, + boost::statechart::custom_reaction< AllReplicasActivated > + > reactions; + explicit Recovered(my_context ctx); + void exit(); + boost::statechart::result react(const AllReplicasActivated&) { + post_event(GoClean()); + return forward_event(); + } + }; + + struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< Backfilled >, + boost::statechart::custom_reaction< DeferBackfill >, + boost::statechart::custom_reaction< UnfoundBackfill >, + boost::statechart::custom_reaction< RemoteReservationRejected >, + boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>, + boost::statechart::custom_reaction< RemoteReservationRevoked> + > reactions; + explicit Backfilling(my_context ctx); + boost::statechart::result react(const RemoteReservationRejected& evt) { + // for compat with old peers + post_event(RemoteReservationRevokedTooFull()); + return discard_event(); + } + void backfill_release_reservations(); + boost::statechart::result react(const Backfilled& evt); + boost::statechart::result react(const RemoteReservationRevokedTooFull& evt); + boost::statechart::result react(const RemoteReservationRevoked& evt); + boost::statechart::result react(const DeferBackfill& evt); + boost::statechart::result react(const UnfoundBackfill& evt); + void cancel_backfill(); + void exit(); + }; + + struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< RemoteBackfillReserved >, + boost::statechart::custom_reaction< RemoteReservationRejected >, + boost::statechart::custom_reaction< RemoteReservationRevoked >, + boost::statechart::transition< AllBackfillsReserved, Backfilling > + > reactions; + set::const_iterator backfill_osd_it; + explicit WaitRemoteBackfillReserved(my_context ctx); + void retry(); + void exit(); + boost::statechart::result react(const RemoteBackfillReserved& evt); + boost::statechart::result react(const RemoteReservationRejected& evt); + boost::statechart::result react(const RemoteReservationRevoked& evt); + }; + + struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved > + > reactions; + explicit WaitLocalBackfillReserved(my_context ctx); + void exit(); + }; + + struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< 
RequestBackfill, WaitLocalBackfillReserved>, + boost::statechart::custom_reaction< RemoteBackfillReserved >, + boost::statechart::custom_reaction< RemoteReservationRejected > + > reactions; + explicit NotBackfilling(my_context ctx); + void exit(); + boost::statechart::result react(const RemoteBackfillReserved& evt); + boost::statechart::result react(const RemoteReservationRejected& evt); + }; + + struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, + boost::statechart::custom_reaction< DeferRecovery >, + boost::statechart::custom_reaction< UnfoundRecovery > + > reactions; + explicit NotRecovering(my_context ctx); + boost::statechart::result react(const DeferRecovery& evt) { + /* no-op */ + return discard_event(); + } + boost::statechart::result react(const UnfoundRecovery& evt) { + /* no-op */ + return discard_event(); + } + void exit(); + }; + + struct ToDelete; + struct RepNotRecovering; + struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState { + explicit ReplicaActive(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< MQuery >, + boost::statechart::custom_reaction< MInfoRec >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< MTrim >, + boost::statechart::custom_reaction< Activate >, + boost::statechart::custom_reaction< DeferRecovery >, + boost::statechart::custom_reaction< DeferBackfill >, + boost::statechart::custom_reaction< UnfoundRecovery >, + boost::statechart::custom_reaction< UnfoundBackfill >, + boost::statechart::custom_reaction< RemoteBackfillPreempted >, + boost::statechart::custom_reaction< RemoteRecoveryPreempted >, + boost::statechart::custom_reaction< RecoveryDone >, + boost::statechart::transition + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const MInfoRec& infoevt); + boost::statechart::result react(const MLogRec& logevt); + boost::statechart::result react(const MTrim& trimevt); + boost::statechart::result react(const ActMap&); + boost::statechart::result react(const MQuery&); + boost::statechart::result react(const Activate&); + boost::statechart::result react(const RecoveryDone&) { + return discard_event(); + } + boost::statechart::result react(const DeferRecovery& evt) { + return discard_event(); + } + boost::statechart::result react(const DeferBackfill& evt) { + return discard_event(); + } + boost::statechart::result react(const UnfoundRecovery& evt) { + return discard_event(); + } + boost::statechart::result react(const UnfoundBackfill& evt) { + return discard_event(); + } + boost::statechart::result react(const RemoteBackfillPreempted& evt) { + return discard_event(); + } + boost::statechart::result react(const RemoteRecoveryPreempted& evt) { + return discard_event(); + } + }; + + struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState { + typedef boost::mpl::list< + boost::statechart::transition< RecoveryDone, RepNotRecovering >, + // for compat with old peers + boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >, + boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >, + boost::statechart::custom_reaction< BackfillTooFull >, + 
boost::statechart::custom_reaction< RemoteRecoveryPreempted >, + boost::statechart::custom_reaction< RemoteBackfillPreempted > + > reactions; + explicit RepRecovering(my_context ctx); + boost::statechart::result react(const RemoteRecoveryPreempted &evt); + boost::statechart::result react(const BackfillTooFull &evt); + boost::statechart::result react(const RemoteBackfillPreempted &evt); + void exit(); + }; + + struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< RemoteBackfillReserved >, + boost::statechart::custom_reaction< RejectRemoteReservation >, + boost::statechart::custom_reaction< RemoteReservationRejected >, + boost::statechart::custom_reaction< RemoteReservationCanceled > + > reactions; + explicit RepWaitBackfillReserved(my_context ctx); + void exit(); + boost::statechart::result react(const RemoteBackfillReserved &evt); + boost::statechart::result react(const RejectRemoteReservation &evt); + boost::statechart::result react(const RemoteReservationRejected &evt); + boost::statechart::result react(const RemoteReservationCanceled &evt); + }; + + struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< RemoteRecoveryReserved >, + // for compat with old peers + boost::statechart::custom_reaction< RemoteReservationRejected >, + boost::statechart::custom_reaction< RemoteReservationCanceled > + > reactions; + explicit RepWaitRecoveryReserved(my_context ctx); + void exit(); + boost::statechart::result react(const RemoteRecoveryReserved &evt); + boost::statechart::result react(const RemoteReservationRejected &evt) { + // for compat with old peers + post_event(RemoteReservationCanceled()); + return discard_event(); + } + boost::statechart::result react(const RemoteReservationCanceled &evt); + }; + + struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState { + typedef boost::mpl::list< + boost::statechart::custom_reaction< RequestRecoveryPrio >, + boost::statechart::custom_reaction< RequestBackfillPrio >, + boost::statechart::custom_reaction< RejectRemoteReservation >, + boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >, + boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >, + boost::statechart::custom_reaction< RemoteRecoveryReserved >, + boost::statechart::custom_reaction< RemoteBackfillReserved >, + boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers + > reactions; + explicit RepNotRecovering(my_context ctx); + boost::statechart::result react(const RequestRecoveryPrio &evt); + boost::statechart::result react(const RequestBackfillPrio &evt); + boost::statechart::result react(const RemoteBackfillReserved &evt) { + // my reservation completion raced with a RELEASE from primary + return discard_event(); + } + boost::statechart::result react(const RemoteRecoveryReserved &evt) { + // my reservation completion raced with a RELEASE from primary + return discard_event(); + } + boost::statechart::result react(const RejectRemoteReservation &evt); + void exit(); + }; + + struct Recovering : boost::statechart::state< Recovering, Active >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< AllReplicasRecovered >, + boost::statechart::custom_reaction< DeferRecovery >, + 
boost::statechart::custom_reaction< UnfoundRecovery >, + boost::statechart::custom_reaction< RequestBackfill > + > reactions; + explicit Recovering(my_context ctx); + void exit(); + void release_reservations(bool cancel = false); + boost::statechart::result react(const AllReplicasRecovered &evt); + boost::statechart::result react(const DeferRecovery& evt); + boost::statechart::result react(const UnfoundRecovery& evt); + boost::statechart::result react(const RequestBackfill &evt); + }; + + struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< RemoteRecoveryReserved >, + boost::statechart::transition< AllRemotesReserved, Recovering > + > reactions; + set::const_iterator remote_recovery_reservation_it; + explicit WaitRemoteRecoveryReserved(my_context ctx); + boost::statechart::result react(const RemoteRecoveryReserved &evt); + void exit(); + }; + + struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState { + typedef boost::mpl::list < + boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >, + boost::statechart::custom_reaction< RecoveryTooFull > + > reactions; + explicit WaitLocalRecoveryReserved(my_context ctx); + void exit(); + boost::statechart::result react(const RecoveryTooFull &evt); + }; + + struct Activating : boost::statechart::state< Activating, Active >, NamedState { + typedef boost::mpl::list < + boost::statechart::transition< AllReplicasRecovered, Recovered >, + boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, + boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved > + > reactions; + explicit Activating(my_context ctx); + void exit(); + }; + + struct Stray : boost::statechart::state< Stray, Started >, + NamedState { + explicit Stray(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< MQuery >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< MInfoRec >, + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< RecoveryDone >, + boost::statechart::transition + > reactions; + boost::statechart::result react(const MQuery& query); + boost::statechart::result react(const MLogRec& logevt); + boost::statechart::result react(const MInfoRec& infoevt); + boost::statechart::result react(const ActMap&); + boost::statechart::result react(const RecoveryDone&) { + return discard_event(); + } + }; + + struct WaitDeleteReserved; + struct ToDelete : boost::statechart::state, NamedState { + unsigned priority = 0; + typedef boost::mpl::list < + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< DeleteSome > + > reactions; + explicit ToDelete(my_context ctx); + boost::statechart::result react(const ActMap &evt); + boost::statechart::result react(const DeleteSome &evt) { + // happens if we drop out of Deleting due to reprioritization etc. 
+ return discard_event(); + } + void exit(); + }; + + struct Deleting; + struct WaitDeleteReserved : boost::statechart::state, NamedState { + typedef boost::mpl::list < + boost::statechart::transition + > reactions; + explicit WaitDeleteReserved(my_context ctx); + void exit(); + }; + + struct Deleting : boost::statechart::state, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< DeleteSome >, + boost::statechart::transition + > reactions; + explicit Deleting(my_context ctx); + boost::statechart::result react(const DeleteSome &evt); + void exit(); + }; + + struct GetLog; + + struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState { + set peer_info_requested; + + explicit GetInfo(my_context ctx); + void exit(); + void get_infos(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::transition< GotInfo, GetLog >, + boost::statechart::custom_reaction< MNotifyRec >, + boost::statechart::transition< IsDown, Down > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const MNotifyRec& infoevt); + }; + + struct GotLog : boost::statechart::event< GotLog > { + GotLog() : boost::statechart::event< GotLog >() {} + }; + + struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState { + pg_shard_t auth_log_shard; + boost::intrusive_ptr msg; + + explicit GetLog(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::custom_reaction< GotLog >, + boost::statechart::custom_reaction< AdvMap >, + boost::statechart::transition< IsIncomplete, Incomplete > + > reactions; + boost::statechart::result react(const AdvMap&); + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const MLogRec& logevt); + boost::statechart::result react(const GotLog&); + }; + + struct WaitUpThru; + + struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState { + set peer_missing_requested; + + explicit GetMissing(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< MLogRec >, + boost::statechart::transition< NeedUpThru, WaitUpThru > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const MLogRec& logevt); + }; + + struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState { + explicit WaitUpThru(my_context ctx); + void exit(); + + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< ActMap >, + boost::statechart::custom_reaction< MLogRec > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const ActMap& am); + boost::statechart::result react(const MLogRec& logrec); + }; + + struct Down : boost::statechart::state< Down, Peering>, NamedState { + explicit Down(my_context ctx); + typedef boost::mpl::list < + boost::statechart::custom_reaction< QueryState >, + boost::statechart::custom_reaction< MNotifyRec > + > reactions; + boost::statechart::result react(const QueryState& q); + boost::statechart::result react(const MNotifyRec& infoevt); + void exit(); + }; + + struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState { + typedef boost::mpl::list < + boost::statechart::custom_reaction< 
AdvMap >, + boost::statechart::custom_reaction< MNotifyRec >, + boost::statechart::custom_reaction< QueryState > + > reactions; + explicit Incomplete(my_context ctx); + boost::statechart::result react(const AdvMap &advmap); + boost::statechart::result react(const MNotifyRec& infoevt); + boost::statechart::result react(const QueryState& q); + void exit(); + }; + + PeeringMachine machine; + PG *pg; + + /// context passed in by state machine caller + PeeringCtx *orig_ctx; + + /// populated if we are buffering messages pending a flush + boost::optional messages_pending_flush; + + /** + * populated between start_handle() and end_handle(), points into + * the message lists for messages_pending_flush while blocking messages + * or into orig_ctx otherwise + */ + boost::optional rctx; + +public: + explicit PeeringState(PG *pg) + : machine(this, pg), pg(pg), orig_ctx(0) { + machine.initiate(); + } + + void handle_event(const boost::statechart::event_base &evt, + PeeringCtx *rctx) { + start_handle(rctx); + machine.process_event(evt); + end_handle(); + } + + void handle_event(PGPeeringEventRef evt, + PeeringCtx *rctx) { + start_handle(rctx); + machine.process_event(evt->get_event()); + end_handle(); + } + +}; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 18b9e2539ab..cb437eb1ae8 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -4223,7 +4223,7 @@ void PrimaryLogPG::do_scan( std::make_shared( get_osdmap_epoch(), get_osdmap_epoch(), - BackfillTooFull()))); + PeeringState::BackfillTooFull()))); return; } @@ -11957,14 +11957,14 @@ void PrimaryLogPG::mark_all_unfound_lost( std::make_shared( get_osdmap_epoch(), get_osdmap_epoch(), - DoRecovery()))); + PeeringState::DoRecovery()))); } else if (is_backfill_unfound()) { queue_peering_event( PGPeeringEventRef( std::make_shared( get_osdmap_epoch(), get_osdmap_epoch(), - RequestBackfill()))); + PeeringState::RequestBackfill()))); } else { queue_recovery(); } @@ -12152,7 +12152,7 @@ void PrimaryLogPG::on_activate() std::make_shared( get_osdmap_epoch(), get_osdmap_epoch(), - DoRecovery()))); + PeeringState::DoRecovery()))); } else if (needs_backfill()) { dout(10) << "activate queueing backfill" << dendl; queue_peering_event( @@ -12160,7 +12160,7 @@ void PrimaryLogPG::on_activate() std::make_shared( get_osdmap_epoch(), get_osdmap_epoch(), - RequestBackfill()))); + PeeringState::RequestBackfill()))); } else { dout(10) << "activate all replicas clean, no recovery" << dendl; eio_errors_to_process = false; @@ -12169,7 +12169,7 @@ void PrimaryLogPG::on_activate() std::make_shared( get_osdmap_epoch(), get_osdmap_epoch(), - AllReplicasRecovered()))); + PeeringState::AllReplicasRecovered()))); } publish_stats_to_osd(); @@ -12502,7 +12502,7 @@ bool PrimaryLogPG::start_recovery_ops( std::make_shared( get_osdmap_epoch(), get_osdmap_epoch(), - RequestBackfill()))); + PeeringState::RequestBackfill()))); } deferred_backfill = true; } else { @@ -12557,7 +12557,7 @@ bool PrimaryLogPG::start_recovery_ops( std::make_shared( get_osdmap_epoch(), get_osdmap_epoch(), - RequestBackfill()))); + PeeringState::RequestBackfill()))); } else { dout(10) << "recovery done, no backfill" << dendl; eio_errors_to_process = false; @@ -12567,7 +12567,7 @@ bool PrimaryLogPG::start_recovery_ops( std::make_shared( get_osdmap_epoch(), get_osdmap_epoch(), - AllReplicasRecovered()))); + PeeringState::AllReplicasRecovered()))); } } else { // backfilling state_clear(PG_STATE_BACKFILLING); @@ -12580,7 +12580,7 @@ bool PrimaryLogPG::start_recovery_ops( std::make_shared( 
get_osdmap_epoch(), get_osdmap_epoch(), - Backfilled()))); + PeeringState::Backfilled()))); } return false; @@ -15253,7 +15253,7 @@ int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ct std::make_shared( get_osdmap_epoch(), get_osdmap_epoch(), - DoRecovery()))); + PeeringState::DoRecovery()))); } else { // A prior error must have already cleared clean state and queued recovery // or a map change has triggered re-peering. diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index 9aed6243b1e..14d8eb8c702 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -1589,6 +1589,7 @@ private: } } snap_trimmer_machine; + using NamedState = PeeringState::NamedState; struct WaitReservation; struct Trimming : boost::statechart::state< Trimming, SnapTrimmer, WaitReservation >, NamedState { typedef boost::mpl::list < -- 2.39.5