git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: mon: account for stretch bucket configs/changes when detecting intervals
author Greg Farnum <gfarnum@redhat.com>
Thu, 16 Jul 2020 02:12:41 +0000 (02:12 +0000)
committer Greg Farnum <gfarnum@redhat.com>
Mon, 20 Jul 2020 07:13:12 +0000 (07:13 +0000)
Factor out the logic we wrote in PeeringState::choose_acting into a new
pg_pool_t::stretch_set_can_peer(), and use it in
PastIntervals::check_new_interval(). Should have accounted for this
when we first set it -- whoops!

Set last_force_resend in the OSDMap when we change values, in order
to make old clients do the right thing. The OSDs and new clients
will detect changes directly by looking at the various crush bucket
values in is_new_interval().

Signed-off-by: Greg Farnum <gfarnum@redhat.com>
src/mon/OSDMonitor.cc
src/osd/PeeringState.cc
src/osd/osd_types.cc
src/osd/osd_types.h
src/osdc/Objecter.cc
src/osdc/Objecter.h

index 02ed3692ffb8de44d199b44ba29d6bab836b36b4..a2b1122d5b8740e4c9308029e5f9938bb9b4c512 100644 (file)
@@ -14383,6 +14383,7 @@ void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
       newp.peering_crush_bucket_count = new_site_count;
       newp.peering_crush_mandatory_member = remaining_site;
       newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
+      newp.last_force_op_resend = pending_inc.epoch;
       pending_inc.new_pools[pgi.first] = newp;
     }
   }
@@ -14403,7 +14404,7 @@ void OSDMonitor::trigger_recovery_stretch_mode()
   for (auto pgi : osdmap.pools) {
     if (pgi.second.peering_crush_bucket_count) {
       pg_pool_t newp(pgi.second);
-      // bump up the min_size since we have extra replicas available...
+      newp.last_force_op_resend = pending_inc.epoch;
       pending_inc.new_pools[pgi.first] = newp;
     }
   }
@@ -14473,6 +14474,7 @@ void OSDMonitor::trigger_healthy_stretch_mode()
       newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
       newp.peering_crush_mandatory_member = 0;
       newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
+      newp.last_force_op_resend = pending_inc.epoch;
       pending_inc.new_pools[pgi.first] = newp;
     }
   }
index 2cf993f69b175aebff3e51add93f525dcc9c8b4f..93b369d8418eb33b71c88a7621606eda6df07fb5 100644 (file)
@@ -2442,22 +2442,9 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
   // didn't break them with earlier choices!
   const pg_pool_t& pg_pool = pool.info;
   if (pg_pool.is_stretch_pool()) {
-    const uint32_t barrier_id = pg_pool.peering_crush_bucket_barrier;
-    const uint32_t barrier_count = pg_pool.peering_crush_bucket_count;
-    set<int> ancestors;
-    const shared_ptr<CrushWrapper>& crush = osdmap_ref->crush;
-    for (int osdid : want) {
-      int ancestor = crush->get_parent_of_type(osdid, barrier_id,
-                                              pg_pool.crush_rule);
-      ancestors.insert(ancestor);
-    }
-    if (ancestors.size() < barrier_count) {
-      psdout(5) << "peeering blocked: not enough crush buckets with OSDs in acting" << dendl;
-      return false;
-    } else if (pg_pool.peering_crush_mandatory_member &&
-              !ancestors.count(pg_pool.peering_crush_mandatory_member)) {
-      psdout(5) << "peering blocked: missing mandatory crush bucket member "
-               << pg_pool.peering_crush_mandatory_member << dendl;
+    stringstream ss;
+    if (!pg_pool.stretch_set_can_peer(want, *get_osdmap(), &ss)) {
+      psdout(5) << "peering blocked by stretch_can_peer: " << ss.str() << dendl;
       return false;
     }
   }
index def01335006e2011f3c9000c28f4e288dc6d4b02..8b7c3f088f4fcb3aff467a3f6b0aaf4b04add9f2 100644 (file)
@@ -2237,6 +2237,35 @@ void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
   calc_grade_table();
 }
 
+bool pg_pool_t::stretch_set_can_peer(const set<int>& want, const OSDMap& osdmap,
+                                    std::ostream * out) const
+{ // Returns true when the OSD ids in `want` pass both stretch peering checks below; otherwise false.
+  const uint32_t barrier_id = peering_crush_bucket_barrier; // bucket type handed to get_parent_of_type()
+  const uint32_t barrier_count = peering_crush_bucket_count; // minimum number of distinct ancestors required
+  set<int> ancestors; // distinct get_parent_of_type() results across `want`
+  const shared_ptr<CrushWrapper>& crush = osdmap.crush;
+  for (int osdid : want) {
+    int ancestor = crush->get_parent_of_type(osdid, barrier_id,
+                                            crush_rule);
+    ancestors.insert(ancestor);
+  }
+  if (ancestors.size() < barrier_count) { // check 1: too few distinct ancestors
+    if (out) { // `out` may be null; the reason is written only when a stream is supplied
+      *out << __func__ << ": not enough crush buckets with OSDs in want set "
+          << want;
+    }
+    return false;
+  } else if (peering_crush_mandatory_member && // check 2 applies only when the member is nonzero
+            !ancestors.count(peering_crush_mandatory_member)) {
+    if (out) {
+      *out << __func__ << ": missing mandatory crush bucket member "
+          << peering_crush_mandatory_member;
+    }
+    return false;
+  }
+  return true; // both checks passed
+}
+
 void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
 {
   pg_pool_t a;
@@ -3951,6 +3980,14 @@ bool PastIntervals::is_new_interval(
   bool new_sort_bitwise,
   bool old_recovery_deletes,
   bool new_recovery_deletes,
+  uint32_t old_crush_count,
+  uint32_t new_crush_count,
+  uint32_t old_crush_target,
+  uint32_t new_crush_target,
+  uint32_t old_crush_barrier,
+  uint32_t new_crush_barrier,
+  int32_t old_crush_member,
+  int32_t new_crush_member,
   pg_t pgid) {
   return old_acting_primary != new_acting_primary ||
     new_acting != old_acting ||
@@ -3970,7 +4007,11 @@ bool PastIntervals::is_new_interval(
     // merge target
     pgid.is_merge_target(old_pg_num, new_pg_num) ||
     old_sort_bitwise != new_sort_bitwise ||
-    old_recovery_deletes != new_recovery_deletes;
+    old_recovery_deletes != new_recovery_deletes ||
+    old_crush_count != new_crush_count ||
+    old_crush_target != new_crush_target ||
+    old_crush_barrier != new_crush_barrier ||
+    old_crush_member != new_crush_member;
 }
 
 bool PastIntervals::is_new_interval(
@@ -4015,6 +4056,10 @@ bool PastIntervals::is_new_interval(
                    osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
                    lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
                    osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
+                   plast->peering_crush_bucket_count, pi->peering_crush_bucket_count,
+                   plast->peering_crush_bucket_target, pi->peering_crush_bucket_target,
+                   plast->peering_crush_bucket_barrier, pi->peering_crush_bucket_barrier,
+                   plast->peering_crush_mandatory_member, pi->peering_crush_mandatory_member,
                    pgid);
 }
 
@@ -4117,6 +4162,8 @@ bool PastIntervals::check_new_interval(
     if (num_acting &&
        i.primary != -1 &&
        num_acting >= old_pg_pool.min_size &&
+       (!old_pg_pool.is_stretch_pool() ||
+        old_pg_pool.stretch_set_can_peer(old_acting, *lastmap, out)) &&
         could_have_gone_active(old_acting_shards)) {
       if (out)
        *out << __func__ << " " << i
@@ -4168,7 +4215,6 @@ bool PastIntervals::check_new_interval(
   }
 }
 
-
 // true if the given map affects the prior set
 bool PastIntervals::PriorSet::affected_by_map(
   const OSDMap &osdmap,
index 18885230e389f68388441b89a8202c3757abc0cc..9dbee932d26733dff1201732b3fb4b370db60765 100644 (file)
@@ -1160,6 +1160,8 @@ struct pg_merge_meta_t {
 };
 WRITE_CLASS_ENCODER(pg_merge_meta_t)
 
+class OSDMap;
+
 /*
  * pg_pool
  */
@@ -1464,6 +1466,15 @@ public:
     return peering_crush_bucket_count != 0;
   }
 
+  bool stretch_set_can_peer(const set<int>& want, const OSDMap& osdmap,
+                           std::ostream *out) const;
+  bool stretch_set_can_peer(const vector<int>& want, const OSDMap& osdmap,
+                           std::ostream *out) const { // vector overload: forwards to the set<int> overload
+    set<int> swant; // `want` copied into a set, so duplicate ids collapse
+    for (auto i : want) swant.insert(i);
+    return stretch_set_can_peer(swant, osdmap, out);
+  }
+
   uint64_t target_max_bytes = 0;   ///< tiering: target max pool size
   uint64_t target_max_objects = 0; ///< tiering: target max pool size
 
@@ -3066,7 +3077,6 @@ struct pg_fast_info_t {
 WRITE_CLASS_ENCODER(pg_fast_info_t)
 
 
-class OSDMap;
 /**
  * PastIntervals -- information needed to determine the PriorSet and
  * the might_have_unfound set
@@ -3202,6 +3212,14 @@ public:
     bool new_sort_bitwise,
     bool old_recovery_deletes,
     bool new_recovery_deletes,
+    uint32_t old_crush_count,
+    uint32_t new_crush_count,
+    uint32_t old_crush_target,
+    uint32_t new_crush_target,
+    uint32_t old_crush_barrier,
+    uint32_t new_crush_barrier,
+    int32_t old_crush_member,
+    int32_t new_crush_member,
     pg_t pgid
     );
 
index 816d358ad12c9688d9925f4fedb98fa1d8611727..0cf592580c3c65a8bace1ec48b14b98b0ef3d7cd 100644 (file)
@@ -2797,6 +2797,14 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
        sort_bitwise,
        t->recovery_deletes,
        recovery_deletes,
+       t->peering_crush_bucket_count,
+       pi->peering_crush_bucket_count,
+       t->peering_crush_bucket_target,
+       pi->peering_crush_bucket_target,
+       t->peering_crush_bucket_barrier,
+       pi->peering_crush_bucket_barrier,
+       t->peering_crush_mandatory_member,
+       pi->peering_crush_mandatory_member,
        prev_pgid)) {
     force_resend = true;
   }
@@ -2848,6 +2856,10 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
     t->actual_pgid = spgid;
     t->sort_bitwise = sort_bitwise;
     t->recovery_deletes = recovery_deletes;
+    t->peering_crush_bucket_count = pi->peering_crush_bucket_count;
+    t->peering_crush_bucket_target = pi->peering_crush_bucket_target;
+    t->peering_crush_bucket_barrier = pi->peering_crush_bucket_barrier;
+    t->peering_crush_mandatory_member = pi->peering_crush_mandatory_member;
     ldout(cct, 10) << __func__ << " "
                   << " raw pgid " << pgid << " -> actual " << t->actual_pgid
                   << " acting " << acting
index 18f17d53ac6e802f957a71fe022630f2e303cfd9..85e08eab547e8a91233d6cd444bf3cb8a3979b52 100644 (file)
@@ -1778,6 +1778,10 @@ public:
     int min_size = -1; ///< the min size of the pool when were were last mapped
     bool sort_bitwise = false; ///< whether the hobject_t sort order is bitwise
     bool recovery_deletes = false; ///< whether the deletes are performed during recovery instead of peering
+    uint32_t peering_crush_bucket_count = 0;
+    uint32_t peering_crush_bucket_target = 0;
+    uint32_t peering_crush_bucket_barrier = 0;
+    int32_t peering_crush_mandatory_member = 0;
 
     bool used_replica = false;
     bool paused = false;