From: xie xingguo
Date: Fri, 14 Feb 2020 10:26:52 +0000 (+0800)
Subject: osd/PeeringState: restart peering on any previous down acting member coming back
X-Git-Tag: v15.1.1~311^2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=023524a26d7e12e7ddfc3537582b1a1cb03af69e;p=ceph.git

osd/PeeringState: restart peering on any previous down acting member coming back

One of our customers wants to verify the data safety of Ceph while scaling
the cluster up, and the test case looks like:
- keep checking the status of a specified pg, whose up set is [1, 2, 3]
- add more osds: up [1, 2, 3] -> up [1, 4, 5], acting = [1, 2, 3],
  backfill_targets = [4, 5], pg is remapped
- stop osd.2: up [1, 4, 5], acting = [1, 3], backfill_targets = [4, 5],
  pg is undersized
- restart osd.2: acting stays unchanged, since 2 belongs to neither the
  current up set nor the current acting set, hence leaving the
  corresponding pg stuck undersized for a long time until all backfill
  targets complete

This does not pose any critical problem -- we will eventually get that pg
back to active+clean -- but the long-lived DEGRADED warnings keep
bothering our customer, who cares about data safety more than anything
else.

The right way to fix this is for

  boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)

to check whether the newly booted node can be validly chosen for the
acting set and, if so, request a new temp mapping. The new temp mapping
then triggers a real interval change that gets rid of the DEGRADED
warning.

Signed-off-by: xie xingguo
Signed-off-by: Yan Jun
---

diff --git a/qa/standalone/osd/repeer-on-acting-back.sh b/qa/standalone/osd/repeer-on-acting-back.sh
new file mode 100755
index 0000000000000..af406ef926b28
--- /dev/null
+++ b/qa/standalone/osd/repeer-on-acting-back.sh
@@ -0,0 +1,129 @@
+#!/usr/bin/env bash
+#
+# Copyright (C) 2020 ZTE Corporation
+#
+# Author: xie xingguo
+# Author: Yan Jun
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+
+    export poolname=test
+    export testobjects=100
+    export loglen=12
+    export trim=$(expr $loglen / 2)
+    export CEPH_MON="127.0.0.1:7115" # git grep '\<7115\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+    # so we will not force auth_log_shard to be acting_primary
+    CEPH_ARGS+="--osd_force_auth_primary_missing_objects=1000000 "
+    # use small pg_log settings, so we always do backfill instead of recovery
+    CEPH_ARGS+="--osd_min_pg_log_entries=$loglen --osd_max_pg_log_entries=$loglen --osd_pg_log_trim_min=$trim "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
+}
+
+
+function TEST_repeer_on_down_acting_member_coming_back() {
+    local dir=$1
+    local dummyfile='/etc/fstab'
+
+    local num_osds=6
+    local osds="$(seq 0 $(expr $num_osds - 1))"
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    for i in $osds
+    do
+        run_osd $dir $i || return 1
+    done
+
+    create_pool $poolname 1 1
+    ceph osd pool set $poolname size 3
+    ceph osd pool set $poolname min_size 2
+    local poolid=$(ceph pg dump pools -f json | jq '.pool_stats' | jq '.[].poolid')
+    local pgid=$poolid.0
+
+    # enable required feature-bits for upmap
+    ceph osd set-require-min-compat-client luminous
+    # reset up to [1,2,3]
+    ceph osd pg-upmap $pgid 1 2 3 || return 1
+
+    flush_pg_stats || return 1
+    wait_for_clean || return 1
+
+    echo "writing initial objects"
+    # write a bunch of objects
+    for i in $(seq 1 $testobjects)
+    do
+        rados -p $poolname put existing_$i $dummyfile
+    done
+
+    WAIT_FOR_CLEAN_TIMEOUT=20 wait_for_clean
+
+    # reset up to [1,4,5]
+    ceph osd pg-upmap $pgid 1 4 5 || return 1
+
+    # wait for peering to complete
+    sleep 2
+
+    # make sure osd.2 belongs to current acting set
+    ceph pg $pgid query | jq '.acting' | grep 2 || return 1
+
+    # kill osd.2
+    kill_daemons $dir KILL osd.2 || return 1
+    ceph osd down osd.2
+
+    # again, wait for peering to complete
+    sleep 2
+
+    # osd.2 should have been moved out from acting set
+    ceph pg $pgid query | jq '.acting' | grep 2 && return 1
+
+    # bring up osd.2
+    activate_osd $dir 2 || return 1
+    wait_for_osd up 2
+
+    # again, wait for peering to complete
+    sleep 2
+
+    # primary should be able to re-add osd.2 into acting
+    ceph pg $pgid query | jq '.acting' | grep 2 || return 1
+
+    WAIT_FOR_CLEAN_TIMEOUT=20 wait_for_clean
+
grep -q "Active: got notify from previous acting member.*, requesting pg_temp change" $(find $dir -name '*osd*log') + then + echo failure + return 1 + fi + echo "success" + + delete_pool $poolname + kill_daemons $dir || return 1 +} + +main repeer-on-acting-back "$@" + +# Local Variables: +# compile-command: "make -j4 && ../qa/run-standalone.sh repeer-on-acting-back.sh" +# End: diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index aa9e047c93371..342fe8e637bb4 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -1996,7 +1996,8 @@ void PeeringState::choose_async_recovery_replicated( */ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id, bool restrict_to_up_acting, - bool *history_les_bound) + bool *history_les_bound, + bool request_pg_temp_change_only) { map all_info(peer_info.begin(), peer_info.end()); all_info[pg_whoami] = info; @@ -2105,6 +2106,8 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id, } return false; } + if (request_pg_temp_change_only) + return true; want_acting.clear(); acting_recovery_backfill = want_acting_backfill; psdout(10) << "acting_recovery_backfill is " @@ -5653,6 +5656,16 @@ boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt) ps->discover_all_missing( context().get_recovery_ctx().msgs); } + // check if it is a previous down acting member that's coming back. + // if so, request pg_temp change to trigger a new interval transition + pg_shard_t auth_log_shard; + bool history_les_bound = false; + ps->choose_acting(auth_log_shard, false, &history_les_bound, true); + if (!ps->want_acting.empty() && ps->want_acting != ps->acting) { + psdout(10) << "Active: got notify from previous acting member " + << notevt.from << ", requesting pg_temp change" + << dendl; + } } return discard_event(); } diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h index 327e35eeb975a..0de153e48d86f 100644 --- a/src/osd/PeeringState.h +++ b/src/osd/PeeringState.h @@ -1556,7 +1556,8 @@ public: bool recoverable(const vector &want) const; bool choose_acting(pg_shard_t &auth_log_shard, bool restrict_to_up_acting, - bool *history_les_bound); + bool *history_les_bound, + bool request_pg_temp_change_only = false); bool search_for_missing( const pg_info_t &oinfo, const pg_missing_t &omissing,