From 88bb6964c9f3f54a0eb6bc1f680997f5b5ffa314 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 23 Mar 2019 15:52:04 -0500 Subject: [PATCH] mon: add 'mon ok-to-{stop,add-offline,rm}' commands Helpers to decide when it is safe to stop a mon, add a mon that is not started, or remove a mon. (Adding and start a mon would always be safe, but it takes time to sync, so it's not really possible to do quickly.) Signed-off-by: Sage Weil (cherry picked from commit cfba0acc01230423252f674d11929bf505dc1ec7) --- qa/standalone/misc/ok-to-stop.sh | 207 +++++++++++++++++++++++++++++++ src/mon/MonCommands.h | 12 ++ src/mon/Monitor.cc | 58 ++++++++- 3 files changed, 276 insertions(+), 1 deletion(-) create mode 100755 qa/standalone/misc/ok-to-stop.sh diff --git a/qa/standalone/misc/ok-to-stop.sh b/qa/standalone/misc/ok-to-stop.sh new file mode 100755 index 0000000000000..9894d3bf4bfd7 --- /dev/null +++ b/qa/standalone/misc/ok-to-stop.sh @@ -0,0 +1,207 @@ +#!/usr/bin/env bash + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON_A="127.0.0.1:7150" # git grep '\<7150\>' : there must be only one + export CEPH_MON_B="127.0.0.1:7151" # git grep '\<7151\>' : there must be only one + export CEPH_MON_C="127.0.0.1:7152" # git grep '\<7152\>' : there must be only one + export CEPH_MON_D="127.0.0.1:7153" # git grep '\<7153\>' : there must be only one + export CEPH_MON_E="127.0.0.1:7154" # git grep '\<7154\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + export ORIG_CEPH_ARGS="$CEPH_ARGS" + + local funcs=${@:-$(set | ${SED} -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + kill_daemons $dir KILL || return 1 + teardown $dir || return 1 + done +} + +function TEST_1_mon_checks() { + local dir=$1 + + CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON_A " + + run_mon $dir a --public-addr=$CEPH_MON_A || return 1 + + ceph mon ok-to-stop dne || return 1 + ! ceph mon ok-to-stop a || return 1 + + ! ceph mon ok-to-add-offline || return 1 + + ! ceph mon ok-to-rm a || return 1 + ceph mon ok-to-rm dne || return 1 +} + +function TEST_2_mons_checks() { + local dir=$1 + + CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B " + + run_mon $dir a --public-addr=$CEPH_MON_A || return 1 + run_mon $dir b --public-addr=$CEPH_MON_B || return 1 + + ceph mon ok-to-stop dne || return 1 + ! ceph mon ok-to-stop a || return 1 + ! ceph mon ok-to-stop b || return 1 + ! ceph mon ok-to-stop a b || return 1 + + ceph mon ok-to-add-offline || return 1 + + ceph mon ok-to-rm a || return 1 + ceph mon ok-to-rm b || return 1 + ceph mon ok-to-rm dne || return 1 +} + +function TEST_3_mons_checks() { + local dir=$1 + + CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C " + + run_mon $dir a --public-addr=$CEPH_MON_A || return 1 + run_mon $dir b --public-addr=$CEPH_MON_B || return 1 + run_mon $dir c --public-addr=$CEPH_MON_C || return 1 + wait_for_quorum 60 3 + + ceph mon ok-to-stop dne || return 1 + ceph mon ok-to-stop a || return 1 + ceph mon ok-to-stop b || return 1 + ceph mon ok-to-stop c || return 1 + ! ceph mon ok-to-stop a b || return 1 + ! ceph mon ok-to-stop b c || return 1 + ! ceph mon ok-to-stop a b c || return 1 + + ceph mon ok-to-add-offline || return 1 + + ceph mon ok-to-rm a || return 1 + ceph mon ok-to-rm b || return 1 + ceph mon ok-to-rm c || return 1 + + kill_daemons $dir KILL mon.b + wait_for_quorum 60 2 + + ! ceph mon ok-to-stop a || return 1 + ceph mon ok-to-stop b || return 1 + ! ceph mon ok-to-stop c || return 1 + + ! ceph mon ok-to-add-offline || return 1 + + ! ceph mon ok-to-rm a || return 1 + ceph mon ok-to-rm b || return 1 + ! ceph mon ok-to-rm c || return 1 +} + +function TEST_4_mons_checks() { + local dir=$1 + + CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D " + + run_mon $dir a --public-addr=$CEPH_MON_A || return 1 + run_mon $dir b --public-addr=$CEPH_MON_B || return 1 + run_mon $dir c --public-addr=$CEPH_MON_C || return 1 + run_mon $dir d --public-addr=$CEPH_MON_D || return 1 + wait_for_quorum 60 4 + + ceph mon ok-to-stop dne || return 1 + ceph mon ok-to-stop a || return 1 + ceph mon ok-to-stop b || return 1 + ceph mon ok-to-stop c || return 1 + ceph mon ok-to-stop d || return 1 + ! ceph mon ok-to-stop a b || return 1 + ! ceph mon ok-to-stop c d || return 1 + + ceph mon ok-to-add-offline || return 1 + + ceph mon ok-to-rm a || return 1 + ceph mon ok-to-rm b || return 1 + ceph mon ok-to-rm c || return 1 + + kill_daemons $dir KILL mon.a + wait_for_quorum 60 3 + + ceph mon ok-to-stop a || return 1 + ! ceph mon ok-to-stop b || return 1 + ! ceph mon ok-to-stop c || return 1 + ! ceph mon ok-to-stop d || return 1 + + ceph mon ok-to-add-offline || return 1 + + ceph mon ok-to-rm a || return 1 + ceph mon ok-to-rm b || return 1 + ceph mon ok-to-rm c || return 1 + ceph mon ok-to-rm d || return 1 +} + +function TEST_5_mons_checks() { + local dir=$1 + + CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D,$CEPH_MON_E " + + run_mon $dir a --public-addr=$CEPH_MON_A || return 1 + run_mon $dir b --public-addr=$CEPH_MON_B || return 1 + run_mon $dir c --public-addr=$CEPH_MON_C || return 1 + run_mon $dir d --public-addr=$CEPH_MON_D || return 1 + run_mon $dir e --public-addr=$CEPH_MON_E || return 1 + wait_for_quorum 60 5 + + ceph mon ok-to-stop dne || return 1 + ceph mon ok-to-stop a || return 1 + ceph mon ok-to-stop b || return 1 + ceph mon ok-to-stop c || return 1 + ceph mon ok-to-stop d || return 1 + ceph mon ok-to-stop e || return 1 + ceph mon ok-to-stop a b || return 1 + ceph mon ok-to-stop c d || return 1 + ! ceph mon ok-to-stop a b c || return 1 + + ceph mon ok-to-add-offline || return 1 + + ceph mon ok-to-rm a || return 1 + ceph mon ok-to-rm b || return 1 + ceph mon ok-to-rm c || return 1 + ceph mon ok-to-rm d || return 1 + ceph mon ok-to-rm e || return 1 + + kill_daemons $dir KILL mon.a + wait_for_quorum 60 4 + + ceph mon ok-to-stop a || return 1 + ceph mon ok-to-stop b || return 1 + ceph mon ok-to-stop c || return 1 + ceph mon ok-to-stop d || return 1 + ceph mon ok-to-stop e || return 1 + + ceph mon ok-to-add-offline || return 1 + + ceph mon ok-to-rm a || return 1 + ceph mon ok-to-rm b || return 1 + ceph mon ok-to-rm c || return 1 + ceph mon ok-to-rm d || return 1 + ceph mon ok-to-rm e || return 1 + + kill_daemons $dir KILL mon.e + wait_for_quorum 60 3 + + ceph mon ok-to-stop a || return 1 + ! ceph mon ok-to-stop b || return 1 + ! ceph mon ok-to-stop c || return 1 + ! ceph mon ok-to-stop d || return 1 + ceph mon ok-to-stop e || return 1 + + ! ceph mon ok-to-add-offline || return 1 + + ceph mon ok-to-rm a || return 1 + ! ceph mon ok-to-rm b || return 1 + ! ceph mon ok-to-rm c || return 1 + ! ceph mon ok-to-rm d || return 1 + ceph mon ok-to-rm e || return 1 +} + +main ok-to-stop "$@" diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index fdbee9dde1689..4a1a58a5c0c68 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -221,6 +221,18 @@ COMMAND("features", "report of connected features", \ "mon", "r") COMMAND("quorum_status", "report status of monitor quorum", \ "mon", "r") +COMMAND("mon ok-to-stop " \ + "name=ids,type=CephString,n=N", + "check whether mon(s) can be safely stopped without reducing immediate " \ + "availability", + "mon", "r") +COMMAND("mon ok-to-add-offline", + "check whether adding a mon and not starting it would break quorum", + "mon", "r") +COMMAND("mon ok-to-rm " \ + "name=id,type=CephString", + "check whether removing the specified mon would break quorum", + "mon", "r") COMMAND_WITH_FLAG("mon_status", "report status of monitors", "mon", "r", FLAG(NOFORWARD)) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index e978de4c420d5..336136db6dece 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -3367,7 +3367,10 @@ void Monitor::handle_command(MonOpRequestRef op) prefix != "mon sync force" && prefix != "mon metadata" && prefix != "mon versions" && - prefix != "mon count-metadata") { + prefix != "mon count-metadata" && + prefix != "mon ok-to-stop" && + prefix != "mon ok-to-add-offline" && + prefix != "mon ok-to-rm") { monmon()->dispatch(op); return; } @@ -3697,6 +3700,59 @@ void Monitor::handle_command(MonOpRequestRef op) rdata.append(ds); rs = ""; r = 0; + } else if (prefix == "mon ok-to-stop") { + vector ids; + if (!cmd_getval(g_ceph_context, cmdmap, "ids", ids)) { + r = -EINVAL; + goto out; + } + set wouldbe; + for (auto rank : quorum) { + wouldbe.insert(monmap->get_name(rank)); + } + for (auto& n : ids) { + if (monmap->contains(n)) { + wouldbe.erase(n); + } + } + if (wouldbe.size() < monmap->size() / 2 + 1) { + r = -EBUSY; + rs = "not enough monitors would be available (" + stringify(wouldbe) + + ") after stopping mons " + stringify(ids); + goto out; + } + r = 0; + rs = "quorum should be preserved (" + stringify(wouldbe) + + ") after stopping " + stringify(ids); + } else if (prefix == "mon ok-to-add-offline") { + if (quorum.size() < (monmap->size() + 1) / 2 + 1) { + rs = "adding a monitor may break quorum (until that monitor starts)"; + r = -EBUSY; + goto out; + } + rs = "adding another mon that is not yet online will not break quorum"; + r = 0; + } else if (prefix == "mon ok-to-rm") { + string id; + if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) { + r = -EINVAL; + rs = "must specify a monitor id"; + goto out; + } + if (!monmap->contains(id)) { + r = 0; + rs = "mon." + id + " does not exist"; + goto out; + } + int rank = monmap->get_rank(id); + if (quorum.count(rank) && + quorum.size() - 1 < (monmap->size() - 1) / 2 + 1) { + r = -EBUSY; + rs = "removing mon." + id + " would break quorum"; + goto out; + } + r = 0; + rs = "safe to remove mon." + id; } else if (prefix == "mon_status") { get_mon_status(f.get(), ds); if (f) -- 2.39.5