From: Greg Farnum Date: Thu, 16 Jul 2020 02:12:41 +0000 (+0000) Subject: osd: mon: account for stretch bucket configs/changes when detecting intervals X-Git-Tag: v16.1.0~1053^2~9 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=97fbd19ccd02046cb643239ac3c6fcb8404b4445;p=ceph.git osd: mon: account for stretch bucket configs/changes when detecting intervals Factor out the logic we wrote in PeeringState::choose_acting into a new pg_pool_t::stretch_set_can_peer(), and use it in PastIntervals::check_new_interval(). Should have accounted for this when we first set it -- whoops! Set last_force_resend in the OSDMap when we change values, in order to make old clients do the right thing. The OSDs and new clients will detect changes directly by looking at the various crush bucket values in is_new_interval(). Signed-off-by: Greg Farnum --- diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 02ed3692ffb..a2b1122d5b8 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -14383,6 +14383,7 @@ void OSDMonitor::trigger_degraded_stretch_mode(const set& dead_buckets, newp.peering_crush_bucket_count = new_site_count; newp.peering_crush_mandatory_member = remaining_site; newp.min_size = pgi.second.min_size / 2; // only support 2 zones now + newp.last_force_op_resend = pending_inc.epoch; pending_inc.new_pools[pgi.first] = newp; } } @@ -14403,7 +14404,7 @@ void OSDMonitor::trigger_recovery_stretch_mode() for (auto pgi : osdmap.pools) { if (pgi.second.peering_crush_bucket_count) { pg_pool_t newp(pgi.second); - // bump up the min_size since we have extra replicas available... + newp.last_force_op_resend = pending_inc.epoch; pending_inc.new_pools[pgi.first] = newp; } } @@ -14473,6 +14474,7 @@ void OSDMonitor::trigger_healthy_stretch_mode() newp.peering_crush_bucket_count = osdmap.stretch_bucket_count; newp.peering_crush_mandatory_member = 0; newp.min_size = g_conf().get_val("mon_stretch_pool_min_size"); + newp.last_force_op_resend = pending_inc.epoch; pending_inc.new_pools[pgi.first] = newp; } } diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 2cf993f69b1..93b369d8418 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -2442,22 +2442,9 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id, // didn't break them with earlier choices! const pg_pool_t& pg_pool = pool.info; if (pg_pool.is_stretch_pool()) { - const uint32_t barrier_id = pg_pool.peering_crush_bucket_barrier; - const uint32_t barrier_count = pg_pool.peering_crush_bucket_count; - set ancestors; - const shared_ptr& crush = osdmap_ref->crush; - for (int osdid : want) { - int ancestor = crush->get_parent_of_type(osdid, barrier_id, - pg_pool.crush_rule); - ancestors.insert(ancestor); - } - if (ancestors.size() < barrier_count) { - psdout(5) << "peeering blocked: not enough crush buckets with OSDs in acting" << dendl; - return false; - } else if (pg_pool.peering_crush_mandatory_member && - !ancestors.count(pg_pool.peering_crush_mandatory_member)) { - psdout(5) << "peering blocked: missing mandatory crush bucket member " - << pg_pool.peering_crush_mandatory_member << dendl; + stringstream ss; + if (!pg_pool.stretch_set_can_peer(want, *get_osdmap(), &ss)) { + psdout(5) << "peering blocked by stretch_can_peer: " << ss.str() << dendl; return false; } } diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index def01335006..8b7c3f088f4 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -2237,6 +2237,35 @@ void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl) calc_grade_table(); } +bool pg_pool_t::stretch_set_can_peer(const set& want, const OSDMap& osdmap, + std::ostream * out) const +{ + const uint32_t barrier_id = peering_crush_bucket_barrier; + const uint32_t barrier_count = peering_crush_bucket_count; + set ancestors; + const shared_ptr& crush = osdmap.crush; + for (int osdid : want) { + int ancestor = crush->get_parent_of_type(osdid, barrier_id, + crush_rule); + ancestors.insert(ancestor); + } + if (ancestors.size() < barrier_count) { + if (out) { + *out << __func__ << ": not enough crush buckets with OSDs in want set " + << want; + } + return false; + } else if (peering_crush_mandatory_member && + !ancestors.count(peering_crush_mandatory_member)) { + if (out) { + *out << __func__ << ": missing mandatory crush bucket member " + << peering_crush_mandatory_member; + } + return false; + } + return true; +} + void pg_pool_t::generate_test_instances(list& o) { pg_pool_t a; @@ -3951,6 +3980,14 @@ bool PastIntervals::is_new_interval( bool new_sort_bitwise, bool old_recovery_deletes, bool new_recovery_deletes, + uint32_t old_crush_count, + uint32_t new_crush_count, + uint32_t old_crush_target, + uint32_t new_crush_target, + uint32_t old_crush_barrier, + uint32_t new_crush_barrier, + int32_t old_crush_member, + int32_t new_crush_member, pg_t pgid) { return old_acting_primary != new_acting_primary || new_acting != old_acting || @@ -3970,7 +4007,11 @@ bool PastIntervals::is_new_interval( // merge target pgid.is_merge_target(old_pg_num, new_pg_num) || old_sort_bitwise != new_sort_bitwise || - old_recovery_deletes != new_recovery_deletes; + old_recovery_deletes != new_recovery_deletes || + old_crush_count != new_crush_count || + old_crush_target != new_crush_target || + old_crush_barrier != new_crush_barrier || + old_crush_member != new_crush_member; } bool PastIntervals::is_new_interval( @@ -4015,6 +4056,10 @@ bool PastIntervals::is_new_interval( osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE), lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES), osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES), + plast->peering_crush_bucket_count, pi->peering_crush_bucket_count, + plast->peering_crush_bucket_target, pi->peering_crush_bucket_target, + plast->peering_crush_bucket_barrier, pi->peering_crush_bucket_barrier, + plast->peering_crush_mandatory_member, pi->peering_crush_mandatory_member, pgid); } @@ -4117,6 +4162,8 @@ bool PastIntervals::check_new_interval( if (num_acting && i.primary != -1 && num_acting >= old_pg_pool.min_size && + (!old_pg_pool.is_stretch_pool() || + old_pg_pool.stretch_set_can_peer(old_acting, *lastmap, out)) && could_have_gone_active(old_acting_shards)) { if (out) *out << __func__ << " " << i @@ -4168,7 +4215,6 @@ bool PastIntervals::check_new_interval( } } - // true if the given map affects the prior set bool PastIntervals::PriorSet::affected_by_map( const OSDMap &osdmap, diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 18885230e38..9dbee932d26 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1160,6 +1160,8 @@ struct pg_merge_meta_t { }; WRITE_CLASS_ENCODER(pg_merge_meta_t) +class OSDMap; + /* * pg_pool */ @@ -1464,6 +1466,15 @@ public: return peering_crush_bucket_count != 0; } + bool stretch_set_can_peer(const set& want, const OSDMap& osdmap, + std::ostream *out) const; + bool stretch_set_can_peer(const vector& want, const OSDMap& osdmap, + std::ostream *out) const { + set swant; + for (auto i : want) swant.insert(i); + return stretch_set_can_peer(swant, osdmap, out); + } + uint64_t target_max_bytes = 0; ///< tiering: target max pool size uint64_t target_max_objects = 0; ///< tiering: target max pool size @@ -3066,7 +3077,6 @@ struct pg_fast_info_t { WRITE_CLASS_ENCODER(pg_fast_info_t) -class OSDMap; /** * PastIntervals -- information needed to determine the PriorSet and * the might_have_unfound set @@ -3202,6 +3212,14 @@ public: bool new_sort_bitwise, bool old_recovery_deletes, bool new_recovery_deletes, + uint32_t old_crush_count, + uint32_t new_crush_count, + uint32_t old_crush_target, + uint32_t new_crush_target, + uint32_t old_crush_barrier, + uint32_t new_crush_barrier, + int32_t old_crush_member, + int32_t new_crush_member, pg_t pgid ); diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 816d358ad12..0cf592580c3 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -2797,6 +2797,14 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change) sort_bitwise, t->recovery_deletes, recovery_deletes, + t->peering_crush_bucket_count, + pi->peering_crush_bucket_count, + t->peering_crush_bucket_target, + pi->peering_crush_bucket_target, + t->peering_crush_bucket_barrier, + pi->peering_crush_bucket_barrier, + t->peering_crush_mandatory_member, + pi->peering_crush_mandatory_member, prev_pgid)) { force_resend = true; } @@ -2848,6 +2856,10 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change) t->actual_pgid = spgid; t->sort_bitwise = sort_bitwise; t->recovery_deletes = recovery_deletes; + t->peering_crush_bucket_count = pi->peering_crush_bucket_count; + t->peering_crush_bucket_target = pi->peering_crush_bucket_target; + t->peering_crush_bucket_barrier = pi->peering_crush_bucket_barrier; + t->peering_crush_mandatory_member = pi->peering_crush_mandatory_member; ldout(cct, 10) << __func__ << " " << " raw pgid " << pgid << " -> actual " << t->actual_pgid << " acting " << acting diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 18f17d53ac6..85e08eab547 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -1778,6 +1778,10 @@ public: int min_size = -1; ///< the min size of the pool when were were last mapped bool sort_bitwise = false; ///< whether the hobject_t sort order is bitwise bool recovery_deletes = false; ///< whether the deletes are performed during recovery instead of peering + uint32_t peering_crush_bucket_count = 0; + uint32_t peering_crush_bucket_target = 0; + uint32_t peering_crush_bucket_barrier = 0; + int32_t peering_crush_mandatory_member = 0; bool used_replica = false; bool paused = false;