From 818b91b7e2af56d9564717a9cef2d353cf42cf3d Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Fri, 7 Nov 2014 15:20:02 -0800 Subject: [PATCH] osd/: go "peered" instead of "active" when < min_size In the case of a replicated pool, the pg will transition to "peered" rather than "active", where it can perform backfill bringing itself up to min_size peers. To that end, most operations now require is_peered rather than is_active (OSDOps being the primary exception). Also, rather than using the query_epoch on the activation message as the activation epoch (for last_epoch_started) on the replica, we instead use the last_epoch_started in the info sent by the primary. This allows the primary to not advance last_epoch_started past the last known actual activation. This will prevent later peering epochs from requiring the last_update from a peered epoch to go active (might be divergent). Fixes: #7862 Signed-off-by: Samuel Just --- src/include/ceph_features.h | 2 + src/osd/PG.cc | 96 +++++++++++++++++++++++-------------- src/osd/PG.h | 10 ++-- src/osd/ReplicatedPG.cc | 4 +- 4 files changed, 70 insertions(+), 42 deletions(-) diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h index ed2b7ee47010e..2fb51375cfc94 100644 --- a/src/include/ceph_features.h +++ b/src/include/ceph_features.h @@ -60,6 +60,7 @@ #define CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT (1ULL<<46) /* overlap w/ fadvise */ #define CEPH_FEATURE_MDS_QUOTA (1ULL<<47) #define CEPH_FEATURE_CRUSH_V4 (1ULL<<48) /* straw2 buckets */ +#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49) #define CEPH_FEATURE_RESERVED2 (1ULL<<61) /* slow down, we are almost out... */ #define CEPH_FEATURE_RESERVED (1ULL<<62) /* DO NOT USE THIS ... last bit! */ @@ -144,6 +145,7 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) { CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT | \ CEPH_FEATURE_MDS_QUOTA | \ CEPH_FEATURE_CRUSH_V4 | \ + CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY | \ 0ULL) #define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL diff --git a/src/osd/PG.cc b/src/osd/PG.cc index c8b02b0dbfd48..d120b21a98c92 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -270,7 +270,7 @@ void PG::proc_master_log( { dout(10) << "proc_master_log for osd." << from << ": " << olog << " " << omissing << dendl; - assert(!is_active() && is_primary()); + assert(!is_peered() && is_primary()); // merge log into our own log to build master log. no need to // make any adjustments to their missing map; we are taking their @@ -1307,11 +1307,14 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id) ++num_want_acting; } - // This is a bit of a problem, if we allow the pg to go active with - // want.size() < min_size, we won't consider the pg to have been - // maybe_went_rw in build_prior. - if (num_want_acting < pool.info.min_size) { + // We go incomplete if below min_size for ec_pools since backfill + // does not currently maintain rollbackability + // Otherwise, we will go "peered", but not "active" + if (num_want_acting < pool.info.min_size && + (pool.info.ec_pool() || + !(get_min_peer_features() & CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY))) { want_acting.clear(); + dout(10) << "choose_acting failed, below min size" << dendl; return false; } @@ -1328,6 +1331,7 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id) } if (!(*recoverable_predicate)(have)) { want_acting.clear(); + dout(10) << "choose_acting failed, not recoverable" << dendl; return false; } @@ -1427,15 +1431,16 @@ void PG::build_might_have_unfound() struct C_PG_ActivateCommitted : public Context { PGRef pg; epoch_t epoch; - C_PG_ActivateCommitted(PG *p, epoch_t e) - : pg(p), epoch(e) {} + epoch_t activation_epoch; + C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae) + : pg(p), epoch(e), activation_epoch(ae) {} void finish(int r) { - pg->_activate_committed(epoch); + pg->_activate_committed(epoch, activation_epoch); } }; void PG::activate(ObjectStore::Transaction& t, - epoch_t query_epoch, + epoch_t activation_epoch, list& tfin, map >& query_map, map > > *activator_map, RecoveryCtx *ctx) { - assert(!is_active()); + assert(!is_peered()); assert(scrubber.callbacks.empty()); assert(callbacks_for_degraded_object.empty()); // -- crash recovery? - if (is_primary() && + if (acting.size() >= pool.info.min_size && + is_primary() && pool.info.crash_replay_interval > 0 && may_need_replay(get_osdmap())) { replay_until = ceph_clock_now(cct); @@ -1470,8 +1476,14 @@ void PG::activate(ObjectStore::Transaction& t, send_notify = false; - if (is_acting(pg_whoami)) - info.last_epoch_started = query_epoch; + if (is_primary()) { + // only update primary last_epoch_started if we will go active + if (acting.size() >= pool.info.min_size) + info.last_epoch_started = activation_epoch; + } else if (is_acting(pg_whoami)) { + // update last_epoch_started on acting replica to whatever the primary sent + info.last_epoch_started = activation_epoch; + } const pg_missing_t &missing = pg_log.get_missing(); @@ -1489,7 +1501,11 @@ void PG::activate(ObjectStore::Transaction& t, dirty_big_info = true; // maybe // find out when we commit - t.register_on_complete(new C_PG_ActivateCommitted(this, query_epoch)); + t.register_on_complete( + new C_PG_ActivateCommitted( + this, + get_osdmap()->get_epoch(), + activation_epoch)); // initialize snap_trimq if (is_primary()) { @@ -1573,6 +1589,7 @@ void PG::activate(ObjectStore::Transaction& t, pi.last_update = info.last_update; pi.last_complete = info.last_update; pi.last_backfill = hobject_t(); + pi.last_epoch_started = info.last_epoch_started; pi.history = info.history; pi.hit_set = info.hit_set; pi.stats.stats.clear(); @@ -1802,32 +1819,40 @@ void PG::replay_queued_ops() publish_stats_to_osd(); } -void PG::_activate_committed(epoch_t e) +void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch) { lock(); - if (pg_has_reset_since(e)) { - dout(10) << "_activate_committed " << e << ", that was an old interval" << dendl; + if (pg_has_reset_since(epoch)) { + dout(10) << "_activate_committed " << epoch + << ", that was an old interval" << dendl; } else if (is_primary()) { peer_activated.insert(pg_whoami); - dout(10) << "_activate_committed " << e << " peer_activated now " << peer_activated + dout(10) << "_activate_committed " << epoch + << " peer_activated now " << peer_activated << " last_epoch_started " << info.history.last_epoch_started << " same_interval_since " << info.history.same_interval_since << dendl; assert(!actingbackfill.empty()); if (peer_activated.size() == actingbackfill.size()) all_activated_and_committed(); } else { - dout(10) << "_activate_committed " << e << " telling primary" << dendl; - MOSDPGInfo *m = new MOSDPGInfo(e); + dout(10) << "_activate_committed " << epoch << " telling primary" << dendl; + MOSDPGInfo *m = new MOSDPGInfo(epoch); pg_notify_t i = pg_notify_t( get_primary().shard, pg_whoami.shard, get_osdmap()->get_epoch(), get_osdmap()->get_epoch(), info); - i.info.history.last_epoch_started = e; + + i.info.history.last_epoch_started = activation_epoch; + if (acting.size() >= pool.info.min_size) { + state_set(PG_STATE_ACTIVE); + } else { + state_set(PG_STATE_PEERED); + } + m->pg_list.push_back(make_pair(i, pg_interval_map_t())); osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch()); - state_set(PG_STATE_ACTIVE); // waiters if (flushes_in_progress == 0) { requeue_ops(waiting_for_peered); @@ -2264,7 +2289,7 @@ void PG::_update_calc_stats() info.stats.stats.calc_copies(MAX(target, actingbackfill.size())); info.stats.stats.sum.num_objects_degraded = 0; info.stats.stats.sum.num_objects_misplaced = 0; - if ((is_degraded() || is_undersized() || !is_clean()) && is_active()) { + if ((is_degraded() || is_undersized() || !is_clean()) && is_peered()) { // NOTE: we only generate copies, degraded, unfound values for // the summation, not individual stat categories. uint64_t num_objects = info.stats.stats.sum.num_objects; @@ -4729,6 +4754,7 @@ void PG::start_peering_interval( // deactivate. state_clear(PG_STATE_ACTIVE); + state_clear(PG_STATE_PEERED); state_clear(PG_STATE_DOWN); state_clear(PG_STATE_RECOVERY_WAIT); state_clear(PG_STATE_RECOVERING); @@ -4860,7 +4886,7 @@ ostream& operator<<(ostream& out, const PG& pg) << "/" << pg.past_intervals.size(); } - if (pg.is_active()) { + if (pg.is_peered()) { if (pg.last_update_ondisk != pg.info.last_update) out << " luod=" << pg.last_update_ondisk; if (pg.last_update_applied != pg.info.last_update) @@ -5569,7 +5595,7 @@ PG::RecoveryState::Peering::Peering(my_context ctx) context< RecoveryMachine >().log_enter(state_name); PG *pg = context< RecoveryMachine >().pg; - assert(!pg->is_active()); + assert(!pg->is_peered()); assert(!pg->is_peering()); assert(pg->is_primary()); pg->state_set(PG_STATE_PEERING); @@ -6410,10 +6436,6 @@ boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoe // may be telling us they have activated (and committed) but we can't // share that until _everyone_ does the same. if (pg->is_actingbackfill(infoevt.from)) { - assert(pg->info.history.last_epoch_started < - pg->info.history.same_interval_since); - assert(infoevt.info.history.last_epoch_started >= - pg->info.history.same_interval_since); dout(10) << " peer osd." << infoevt.from << " activated and committed" << dendl; pg->peer_activated.insert(infoevt.from); @@ -6507,7 +6529,11 @@ boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActi pg->state_clear(PG_STATE_ACTIVATING); pg->state_clear(PG_STATE_CREATING); - pg->state_set(PG_STATE_ACTIVE); + if (pg->acting.size() >= pg->pool.info.min_size) { + pg->state_set(PG_STATE_ACTIVE); + } else { + pg->state_set(PG_STATE_PEERED); + } // info.last_epoch_started is set during activate() pg->info.history.last_epoch_started = pg->info.last_epoch_started; @@ -6518,7 +6544,7 @@ boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActi pg->check_local(); // waiters - if (!pg->is_replay() && pg->flushes_in_progress == 0) { + if (pg->flushes_in_progress == 0) { pg->requeue_ops(pg->waiting_for_peered); } @@ -6569,7 +6595,7 @@ boost::statechart::result PG::RecoveryState::ReplicaActive::react( PG *pg = context< RecoveryMachine >().pg; map > query_map; pg->activate(*context< RecoveryMachine >().get_cur_transaction(), - actevt.query_epoch, + actevt.activation_epoch, *context< RecoveryMachine >().get_on_safe_context_list(), query_map, NULL, NULL); dout(10) << "Activate Finished" << dendl; @@ -6648,7 +6674,7 @@ PG::RecoveryState::Stray::Stray(my_context ctx) context< RecoveryMachine >().log_enter(state_name); PG *pg = context< RecoveryMachine >().pg; - assert(!pg->is_active()); + assert(!pg->is_peered()); assert(!pg->is_peering()); assert(!pg->is_primary()); pg->start_flush( @@ -6683,7 +6709,7 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt) assert(pg->pg_log.get_head() == pg->info.last_update); - post_event(Activate(logevt.msg->get_epoch())); + post_event(Activate(logevt.msg->info.last_epoch_started)); return transit(); } @@ -6703,7 +6729,7 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoev assert(infoevt.info.last_update == pg->info.last_update); assert(pg->pg_log.get_head() == pg->info.last_update); - post_event(Activate(infoevt.msg_epoch)); + post_event(Activate(infoevt.info.last_epoch_started)); return transit(); } diff --git a/src/osd/PG.h b/src/osd/PG.h index 4408b8ca8ba71..df4c156580ab7 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -970,13 +970,13 @@ public: void replay_queued_ops(); void activate( ObjectStore::Transaction& t, - epoch_t query_epoch, + epoch_t activation_epoch, list& tfin, map >& query_map, map > > *activator_map, RecoveryCtx *ctx); - void _activate_committed(epoch_t e); + void _activate_committed(epoch_t epoch, epoch_t activation_epoch); void all_activated_and_committed(); void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info); @@ -1369,11 +1369,11 @@ public: } }; struct Activate : boost::statechart::event< Activate > { - epoch_t query_epoch; + epoch_t activation_epoch; Activate(epoch_t q) : boost::statechart::event< Activate >(), - query_epoch(q) {} + activation_epoch(q) {} void print(std::ostream *out) const { - *out << "Activate from " << query_epoch; + *out << "Activate from " << activation_epoch; } }; struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > { diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 67dd16423dbe7..afb53d60cf645 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -1284,7 +1284,7 @@ void ReplicatedPG::do_request( } } - assert(is_active() && flushes_in_progress == 0); + assert(is_peered() && flushes_in_progress == 0); if (pgbackend->handle_message(op)) return; @@ -10176,7 +10176,7 @@ void ReplicatedPG::on_flushed() if (flushes_in_progress == 0) { requeue_ops(waiting_for_peered); } - if (!is_active() || !is_primary()) { + if (!is_peered() || !is_primary()) { pair i; while (object_contexts.get_next(i.first, &i)) { derr << "on_flushed: object " << i.first << " obc still alive" << dendl; -- 2.39.5