From 9524bdbddae6b8e9de0239bee46bcc7c40453821 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Sun, 24 Feb 2019 10:52:05 -0800 Subject: [PATCH] mon: add freeze MDS command This is a new hidden command that allows us to do certain testing for race conditions. A frozen MDS cannot change state or be replaced by a standby. Signed-off-by: Patrick Donnelly --- src/mds/FSMap.cc | 34 ++++++++++++++--------- src/mds/FSMap.h | 2 ++ src/mds/MDSMap.cc | 10 ++++++- src/mds/MDSMap.h | 57 ++++++++++++++++++++++++--------------- src/mds/MDSRank.cc | 10 +------ src/mds/mdstypes.cc | 1 - src/mds/mdstypes.h | 9 ++++--- src/mon/CMakeLists.txt | 1 + src/mon/CommandHandler.cc | 43 +++++++++++++++++++++++++++++ src/mon/CommandHandler.h | 35 ++++++++++++++++++++++++ src/mon/FSCommands.cc | 24 ----------------- src/mon/FSCommands.h | 15 ++--------- src/mon/MDSMonitor.cc | 32 +++++++++++++++++++++- src/mon/MDSMonitor.h | 3 ++- src/mon/MonCommands.h | 3 +++ 15 files changed, 191 insertions(+), 88 deletions(-) create mode 100644 src/mon/CommandHandler.cc create mode 100644 src/mon/CommandHandler.h diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc index 74a340fc700..e395f2dcbcc 100644 --- a/src/mds/FSMap.cc +++ b/src/mds/FSMap.cc @@ -711,32 +711,40 @@ void Filesystem::print(std::ostream &out) const mds_map.print(out); } -mds_gid_t FSMap::find_replacement_for(mds_role_t role, std::string_view name) const +mds_gid_t FSMap::get_available_standby() const { - auto&& fs = get_filesystem(role.fscid); - - // First see if we have a STANDBY_REPLAY - for (const auto& [gid, info] : fs->mds_map.mds_info) { - if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) { - return gid; - } - } - - // See if there are any STANDBY daemons available for (const auto& [gid, info] : standby_daemons) { ceph_assert(info.rank == MDS_RANK_NONE); ceph_assert(info.state == MDSMap::STATE_STANDBY); - if (info.laggy()) { + if (info.laggy() || info.is_frozen()) { continue; } return gid; } - return 
MDS_GID_NONE; } +mds_gid_t FSMap::find_replacement_for(mds_role_t role, std::string_view name) const +{ + auto&& fs = get_filesystem(role.fscid); + + // First see if we have a STANDBY_REPLAY + for (const auto& [gid, info] : fs->mds_map.mds_info) { + if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) { + if (info.is_frozen()) { + /* the standby-replay is frozen, do nothing! */ + return MDS_GID_NONE; + } else { + return gid; + } + } + } + + return get_available_standby(); +} + void FSMap::sanity() const { if (legacy_client_fscid != FS_CLUSTER_ID_NONE) { diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h index 46f445ff4fe..36d024c2e36 100644 --- a/src/mds/FSMap.h +++ b/src/mds/FSMap.h @@ -193,6 +193,8 @@ public: return result; } + mds_gid_t get_available_standby() const; + /** * Resolve daemon name to GID */ diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index c29b283570a..889d35b7f1e 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -89,6 +89,7 @@ void MDSMap::mds_info_t::dump(Formatter *f) const } f->close_section(); f->dump_unsigned("features", mds_features); + f->dump_unsigned("flags", flags); } void MDSMap::mds_info_t::print_summary(ostream &out) const @@ -106,6 +107,9 @@ void MDSMap::mds_info_t::print_summary(ostream &out) const if (!export_targets.empty()) { out << " export_targets=" << export_targets; } + if (is_frozen()) { + out << " frozen"; + } } void MDSMap::mds_info_t::generate_test_instances(list& ls) @@ -504,7 +508,7 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) const { - __u8 v = 8; + __u8 v = 9; if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { v = 7; } @@ -527,6 +531,7 @@ void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) con encode(mds_features, bl); encode(FS_CLUSTER_ID_NONE, bl); /* standby_for_fscid */ encode(false, bl); + encode(flags, bl); ENCODE_FINISH(bl); } @@ -579,6 +584,9 @@ void 
MDSMap::mds_info_t::decode(bufferlist::const_iterator& bl) bool standby_replay; decode(standby_replay, bl); } + if (struct_v >= 9) { /* flags were introduced with encoding v9; v8 payloads do not carry them */ + decode(flags, bl); + } DECODE_FINISH(bl); } diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index 6916d0ad2be..5ba266a4294 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -110,12 +110,24 @@ public: utime_t laggy_since; std::set export_targets; uint64_t mds_features = 0; + uint64_t flags = 0; + enum mds_flags : uint64_t { + FROZEN = 1 << 0, + }; mds_info_t() = default; bool laggy() const { return !(laggy_since == utime_t()); } void clear_laggy() { laggy_since = utime_t(); } + bool is_degraded() const { + return STATE_REPLAY <= state && state <= STATE_CLIENTREPLAY; + } + + void freeze() { flags |= mds_flags::FROZEN; } + void unfreeze() { flags &= ~mds_flags::FROZEN; } + bool is_frozen() const { return flags&mds_flags::FROZEN; } + const entity_addrvec_t& get_addrs() const { return addrs; } @@ -541,29 +553,33 @@ public: return is_clientreplay(m) || is_active(m) || is_stopping(m); } + mds_gid_t get_standby_replay(mds_rank_t r) const { + for (auto& [gid,info] : mds_info) { + if (info.rank == r && info.state == STATE_STANDBY_REPLAY) { + return gid; + } + } + return MDS_GID_NONE; + } + bool has_standby_replay(mds_rank_t r) const { + return get_standby_replay(r) != MDS_GID_NONE; + } + bool is_followable(mds_rank_t r) const { - bool has_followable_rank = false; - for (const auto& p : mds_info) { - auto& info = p.second; - if (info.rank == r) { - if (info.state == STATE_ACTIVE) { - has_followable_rank = true; - } else { - return false; + if (auto it1 = up.find(r); it1 != up.end()) { + if (auto it2 = mds_info.find(it1->second); it2 != mds_info.end()) { + auto& info = it2->second; + if (!info.is_degraded() && !has_standby_replay(r)) { + return true; } } - if (p.second.state == STATE_STANDBY_REPLAY) { - return false; - } } - return has_followable_rank; + return false; } bool is_laggy_gid(mds_gid_t gid) const { - if (!mds_info.count(gid)) 
- return false; - std::map::const_iterator p = mds_info.find(gid); - return p->second.laggy(); + auto it = mds_info.find(gid); + return it == mds_info.end() ? false : it->second.laggy(); } // degraded = some recovery in process. fixes active membership and @@ -571,11 +587,10 @@ public: bool is_degraded() const { if (!failed.empty() || !damaged.empty()) return true; - for (std::map::const_iterator p = mds_info.begin(); - p != mds_info.end(); - ++p) - if (p->second.state >= STATE_REPLAY && p->second.state <= STATE_CLIENTREPLAY) - return true; + for (const auto& p : mds_info) { + if (p.second.is_degraded()) + return true; + } return false; } bool is_any_failed() const { diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index ad4a4e50e80..be90b70f57a 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -2393,15 +2393,7 @@ void MDSRankDispatcher::handle_mds_map( std::mem_fn(&OSDMap::get_epoch))); /* Now check if we should hint to the OSD that a read may follow */ - bool found = false; - for (const auto& p : mdsmap->get_mds_info()) { - auto& info = p.second; - if (info.state == MDSMap::STATE_STANDBY_REPLAY && info.rank == whoami) { - found = true; - break; - } - } - if (found) + if (mdsmap->has_standby_replay(whoami)) mdlog->set_write_iohint(0); else mdlog->set_write_iohint(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index 2c671b5d0ae..20a2d2d5788 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -6,7 +6,6 @@ #include "common/Formatter.h" const mds_gid_t MDS_GID_NONE = mds_gid_t(0); -const mds_rank_t MDS_RANK_NONE = mds_rank_t(-1); /* diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 2130ace30b3..49f6944e4bf 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -75,14 +75,15 @@ typedef int32_t mds_rank_t; -typedef int32_t fs_cluster_id_t; +constexpr mds_rank_t MDS_RANK_NONE = -1; BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t) extern const mds_gid_t MDS_GID_NONE; -constexpr fs_cluster_id_t 
FS_CLUSTER_ID_NONE = {-1}; + +typedef int32_t fs_cluster_id_t; +constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1; // The namespace ID of the anonymous default filesystem from legacy systems -constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = {0}; -extern const mds_rank_t MDS_RANK_NONE; +constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = 0; class mds_role_t { diff --git a/src/mon/CMakeLists.txt b/src/mon/CMakeLists.txt index 02b459e214b..23396f063bd 100644 --- a/src/mon/CMakeLists.txt +++ b/src/mon/CMakeLists.txt @@ -7,6 +7,7 @@ set(lib_mon_srcs PaxosService.cc OSDMonitor.cc MDSMonitor.cc + CommandHandler.cc FSCommands.cc MgrMonitor.cc MgrStatMonitor.cc diff --git a/src/mon/CommandHandler.cc b/src/mon/CommandHandler.cc new file mode 100644 index 00000000000..903d359272c --- /dev/null +++ b/src/mon/CommandHandler.cc @@ -0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Ltd + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "CommandHandler.h" + +#include "common/strtol.h" +#include "include/ceph_assert.h" + +#include <ostream> +#include <string> +#include <string_view> + +int CommandHandler::parse_bool(std::string_view str, bool* result, std::ostream& ss) +{ + ceph_assert(result != nullptr); + + std::string interr; + int64_t n = strict_strtoll(std::string(str).c_str(), 10, &interr); /* copy first: string_view::data() is not guaranteed NUL-terminated */ + + if (str == "false" || str == "no" + || (interr.length() == 0 && n == 0)) { + *result = false; + return 0; + } else if (str == "true" || str == "yes" + || (interr.length() == 0 && n == 1)) { + *result = true; + return 0; + } else { + ss << "value must be false|no|0 or true|yes|1"; + return -EINVAL; + } +} diff --git a/src/mon/CommandHandler.h b/src/mon/CommandHandler.h new file mode 100644 index 00000000000..167b4587f91 --- /dev/null +++ b/src/mon/CommandHandler.h @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Ltd + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef COMMAND_HANDLER_H_ +#define COMMAND_HANDLER_H_ + +#include <ostream> +#include <string_view> + +class CommandHandler +{ +public: + /** + * Parse true|yes|1 style boolean string from `str` + * `result` must be non-null. + * `ss` will be populated with error message on error. 
+ * + * @return 0 on success, else -EINVAL + */ + int parse_bool(std::string_view str, bool* result, std::ostream& ss); +}; + +#endif diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc index 3cc78c0f30f..5dd9799c2aa 100644 --- a/src/mon/FSCommands.cc +++ b/src/mon/FSCommands.cc @@ -974,30 +974,6 @@ FileSystemCommandHandler::load(Paxos *paxos) return handlers; } -int FileSystemCommandHandler::parse_bool( - const std::string &bool_str, - bool *result, - std::ostream &ss) -{ - ceph_assert(result != nullptr); - - string interr; - int64_t n = strict_strtoll(bool_str.c_str(), 10, &interr); - - if (bool_str == "false" || bool_str == "no" - || (interr.length() == 0 && n == 0)) { - *result = false; - return 0; - } else if (bool_str == "true" || bool_str == "yes" - || (interr.length() == 0 && n == 1)) { - *result = true; - return 0; - } else { - ss << "value must be false|no|0 or true|yes|1"; - return -EINVAL; - } -} - int FileSystemCommandHandler::_check_pool( OSDMap &osd_map, const int64_t pool_id, diff --git a/src/mon/FSCommands.h b/src/mon/FSCommands.h index f7066c49f86..69662d2b29e 100644 --- a/src/mon/FSCommands.h +++ b/src/mon/FSCommands.h @@ -17,6 +17,7 @@ #define FS_COMMANDS_H_ #include "Monitor.h" +#include "CommandHandler.h" #include "osd/OSDMap.h" #include "mds/FSMap.h" @@ -24,23 +25,11 @@ #include #include -class FileSystemCommandHandler +class FileSystemCommandHandler : protected CommandHandler { protected: std::string prefix; - /** - * Parse true|yes|1 style boolean string from `bool_str` - * `result` must be non-null. - * `ss` will be populated with error message on error. 
- * - * @return 0 on success, else -EINVAL - */ - int parse_bool( - const std::string &bool_str, - bool *result, - std::ostream &ss); - /** * Return 0 if the pool is suitable for use with CephFS, or * in case of errors return a negative error code, and populate diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 521de9baa14..9739e7176eb 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -1407,6 +1407,34 @@ int MDSMonitor::filesystem_command( dout(1) << "repaired: no-op on rank " << role << dendl; } + r = 0; + } else if (prefix == "mds freeze") { + std::string who; + cmd_getval(g_ceph_context, cmdmap, "role_or_gid", who); + mds_gid_t gid = gid_from_arg(fsmap, who, ss); + if (gid == MDS_GID_NONE) { + return -EINVAL; + } + + bool freeze = false; + { + std::string str; + cmd_getval(g_ceph_context, cmdmap, "val", str); + if ((r = parse_bool(str, &freeze, ss)) != 0) { + return r; + } + } + + auto f = [freeze,gid,&ss](auto& info) { + if (freeze) { + ss << "freezing mds." << gid; + info.freeze(); + } else { + ss << "unfreezing mds." 
<< gid; + info.unfreeze(); + } + }; + fsmap.modify_daemon(gid, f); r = 0; } else { return -ENOSYS; @@ -1794,6 +1822,7 @@ void MDSMonitor::maybe_replace_gid(FSMap &fsmap, mds_gid_t gid, } mono_time now = mono_clock::now(); chrono::duration since = now-latest_beacon; + const bool frozen = info.is_frozen(); const bool may_replace = since.count() < std::max(g_conf()->mds_beacon_interval, g_conf()->mds_beacon_grace * 0.5); @@ -1804,6 +1833,7 @@ void MDSMonitor::maybe_replace_gid(FSMap &fsmap, mds_gid_t gid, info.state != MDSMap::STATE_STANDBY && info.state != MDSMap::STATE_STANDBY_REPLAY && may_replace && + !frozen && !fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) && (sgid = fsmap.find_replacement_for({fscid, info.rank}, info.name)) != MDS_GID_NONE) { @@ -1832,7 +1862,7 @@ void MDSMonitor::maybe_replace_gid(FSMap &fsmap, mds_gid_t gid, *mds_propose = true; } else if ((info.state == MDSMap::STATE_STANDBY_REPLAY || - info.state == MDSMap::STATE_STANDBY) && may_replace) { + info.state == MDSMap::STATE_STANDBY) && may_replace && !frozen) { dout(1) << " failing and removing " << gid << " " << info.addrs << " mds." << info.rank << "." 
<< info.inc << " " << ceph_mds_state_name(info.state) diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h index 87ce40cc022..367f3c7d92e 100644 --- a/src/mon/MDSMonitor.h +++ b/src/mon/MDSMonitor.h @@ -26,13 +26,14 @@ #include "PaxosService.h" #include "msg/Messenger.h" #include "messages/MMDSBeacon.h" +#include "CommandHandler.h" class MMonCommand; class MMDSLoadTargets; class MMDSMap; class FileSystemCommandHandler; -class MDSMonitor : public PaxosService, public PaxosFSMap { +class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHandler { public: MDSMonitor(Monitor *mn, Paxos *p, string service_name); diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 15326c09808..d7a94be696f 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -322,6 +322,9 @@ COMMAND_WITH_FLAG("mds set " \ "name=val,type=CephString " \ "name=yes_i_really_mean_it,type=CephBool,req=false", \ "set mds parameter to ", "mds", "rw", FLAG(OBSOLETE)) +COMMAND_WITH_FLAG("mds freeze name=role_or_gid,type=CephString" + " name=val,type=CephString", + "freeze MDS yes/no", "mds", "rw", FLAG(HIDDEN)) // arbitrary limit 0-20 below; worth standing on head to make it // relate to actual state definitions? // #include "include/ceph_fs.h" -- 2.39.5