osd/PeeringState: fix pending want_acting vs osd offline race

author xie xingguo <xie.xingguo@zte.com.cn>

Thu, 12 Mar 2020 01:41:10 +0000 (09:41 +0800)

committer xie xingguo <xie.xingguo@zte.com.cn>

Tue, 17 Mar 2020 04:07:28 +0000 (12:07 +0800)
author xie xingguo <xie.xingguo@zte.com.cn>
Thu, 12 Mar 2020 01:41:10 +0000 (09:41 +0800)
committer xie xingguo <xie.xingguo@zte.com.cn>
Tue, 17 Mar 2020 04:07:28 +0000 (12:07 +0800)
diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc

index 685334cc09abef2ab8ec53d68676b4fbdba81e4a..4976d520a047a1c3a0fc6a36f6f1f29a30f93d62 100644 (file)
--- a/src/osd/PeeringState.cc
+++ b/src/osd/PeeringState.cc
@@ -5575,13 +5575,32 @@ boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
      ps->share_pg_info();
    }
  
+  bool need_acting_change = false;
    for (size_t i = 0; i < ps->want_acting.size(); i++) {
      int osd = ps->want_acting[i];
      if (!advmap.osdmap->is_up(osd)) {
        pg_shard_t osd_with_shard(osd, shard_id_t(i));
-      ceph_assert(ps->is_acting(osd_with_shard) || ps->is_up(osd_with_shard));
+      if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) {
+        psdout(10) << "Active stray osd." << osd << " in want_acting is down"
+                   << dendl;
+        need_acting_change = true;
+      }
      }
    }
+  if (need_acting_change) {
+     psdout(10) << "Active need acting change, call choose_acting again"
+                << dendl;
+    // possibly because we re-add some strays into the acting set and
+    // some of them then go down in a subsequent map before we could see
+    // the map changing the pg temp.
+    // call choose_acting again to clear them out.
+    // note that we leave restrict_to_up_acting to false in order to
+    // not overkill any chosen stray that is still alive.
+    pg_shard_t auth_log_shard;
+    bool history_les_bound = false;
+    ps->remove_down_peer_info(advmap.osdmap);
+    ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
+  }
  
    /* Check for changes in pool size (if the acting set changed as a result,
     * this does not matter) */
author	xie xingguo <xie.xingguo@zte.com.cn>
	Thu, 12 Mar 2020 01:41:10 +0000 (09:41 +0800)
committer	xie xingguo <xie.xingguo@zte.com.cn>
	Tue, 17 Mar 2020 04:07:28 +0000 (12:07 +0800)