From 49c1be8774529f2b701effae36d1ecee9bb85b06 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 6 Jul 2017 15:32:20 -0400 Subject: [PATCH] mon: collect mon metadata as part of election Previously each peon would send a message to the leader with its metadata immediately after the election was won. However, at that point paxos usually wasn't writeable, which meant the old update_mon_metadata() method didn't persist reliably, updates would race, and generally speaking metadata wasn't reliably updated. Fix this by including metadata as part of the election ack, and persisting the whole quorum when the election is won. This ensures it is up to date. Fixes: http://tracker.ceph.com/issues/20434 Signed-off-by: Sage Weil --- src/messages/MMonElection.h | 28 +++++++++++++------------- src/mon/Elector.cc | 13 ++++++++---- src/mon/Elector.h | 5 +++-- src/mon/Monitor.cc | 40 ++++++++++++++++++++++++++++++------- src/mon/Monitor.h | 1 + 5 files changed, 60 insertions(+), 27 deletions(-) diff --git a/src/messages/MMonElection.h b/src/messages/MMonElection.h index 79503875e26..c9b87c451ec 100644 --- a/src/messages/MMonElection.h +++ b/src/messages/MMonElection.h @@ -22,7 +22,7 @@ class MMonElection : public Message { - static const int HEAD_VERSION = 6; + static const int HEAD_VERSION = 7; static const int COMPAT_VERSION = 5; public: @@ -48,25 +48,19 @@ public: uint64_t quorum_features; mon_feature_t mon_features; bufferlist sharing_bl; - /* the following were both used in the next branch for a while - * on user cluster, so we've left them in for compatibility. 
*/ - version_t defunct_one; - version_t defunct_two; + map metadata; MMonElection() : Message(MSG_MON_ELECTION, HEAD_VERSION, COMPAT_VERSION), op(0), epoch(0), quorum_features(0), - mon_features(0), - defunct_one(0), - defunct_two(0) + mon_features(0) { } MMonElection(int o, epoch_t e, MonMap *m) : Message(MSG_MON_ELECTION, HEAD_VERSION, COMPAT_VERSION), fsid(m->fsid), op(o), epoch(e), quorum_features(0), - mon_features(0), - defunct_one(0), defunct_two(0) + mon_features(0) { // encode using full feature set; we will reencode for dest later, // if necessary @@ -96,10 +90,11 @@ public: ::encode(monmap_bl, payload); ::encode(quorum, payload); ::encode(quorum_features, payload); - ::encode(defunct_one, payload); - ::encode(defunct_two, payload); + ::encode((version_t)0, payload); // defunct + ::encode((version_t)0, payload); // defunct ::encode(sharing_bl, payload); ::encode(mon_features, payload); + ::encode(metadata, payload); } void decode_payload() override { bufferlist::iterator p = payload.begin(); @@ -109,11 +104,16 @@ public: ::decode(monmap_bl, p); ::decode(quorum, p); ::decode(quorum_features, p); - ::decode(defunct_one, p); - ::decode(defunct_two, p); + { + version_t v; // defunct fields from old encoding + ::decode(v, p); + ::decode(v, p); + } ::decode(sharing_bl, p); if (header.version >= 6) ::decode(mon_features, p); + if (header.version >= 7) + ::decode(metadata, p); } }; diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc index 303510530b5..a2244a3c6da 100644 --- a/src/mon/Elector.cc +++ b/src/mon/Elector.cc @@ -87,6 +87,7 @@ void Elector::start() electing_me = true; acked_me[mon->rank].cluster_features = CEPH_FEATURES_ALL; acked_me[mon->rank].mon_features = ceph::features::mon::get_supported(); + mon->collect_metadata(&acked_me[mon->rank].metadata); leader_acked = -1; // bcast to everyone else @@ -117,6 +118,7 @@ void Elector::defer(int who) MMonElection *m = new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap); m->mon_features = 
ceph::features::mon::get_supported(); m->sharing_bl = mon->get_supported_commands_bl(); + mon->collect_metadata(&m->metadata); mon->messenger->send_message(m, mon->monmap->get_inst(who)); // set a timer @@ -184,12 +186,14 @@ void Elector::victory() uint64_t cluster_features = CEPH_FEATURES_ALL; mon_feature_t mon_features = ceph::features::mon::get_supported(); set quorum; - for (map::iterator p = acked_me.begin(); + map metadata; + for (map::iterator p = acked_me.begin(); p != acked_me.end(); ++p) { quorum.insert(p->first); cluster_features &= p->second.cluster_features; mon_features &= p->second.mon_features; + metadata[p->first] = p->second.metadata; } cancel_timer(); @@ -216,10 +220,10 @@ void Elector::victory() m->sharing_bl = *cmds_bl; mon->messenger->send_message(m, mon->monmap->get_inst(*p)); } - + // tell monitor mon->win_election(epoch, quorum, - cluster_features, mon_features, + cluster_features, mon_features, metadata, cmds, cmdsize); } @@ -331,8 +335,9 @@ void Elector::handle_ack(MonOpRequestRef op) // thanks acked_me[from].cluster_features = m->get_connection()->get_features(); acked_me[from].mon_features = m->mon_features; + acked_me[from].metadata = m->metadata; dout(5) << " so far i have {"; - for (map::const_iterator p = acked_me.begin(); + for (map::const_iterator p = acked_me.begin(); p != acked_me.end(); ++p) { if (p != acked_me.begin()) diff --git a/src/mon/Elector.h b/src/mon/Elector.h index 2e407d29058..b9e6310b5b8 100644 --- a/src/mon/Elector.h +++ b/src/mon/Elector.h @@ -47,9 +47,10 @@ class Elector { * mon-specific features. Instead of keeping maps to hold them both, or * a pair, which would be weird, a struct to keep them seems appropriate. */ - struct elector_features_t { + struct elector_info_t { uint64_t cluster_features; mon_feature_t mon_features; + map metadata; }; /** @@ -130,7 +131,7 @@ class Elector { * If we are acked by everyone in the MonMap, we will declare * victory. Also note each peer's feature set. 
*/ - map acked_me; + map acked_me; /** * @} */ diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index ccf0b751d33..f70b78a7ec6 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -1870,12 +1870,16 @@ void Monitor::win_standalone_election() set q; q.insert(rank); - const MonCommand *my_cmds; - int cmdsize; + map metadata; + collect_metadata(&metadata[0]); + + const MonCommand *my_cmds = nullptr; + int cmdsize = 0; get_locally_supported_monitor_commands(&my_cmds, &cmdsize); win_election(elector.get_epoch(), q, CEPH_FEATURES_ALL, ceph::features::mon::get_supported(), + metadata, my_cmds, cmdsize); } @@ -1904,6 +1908,7 @@ void Monitor::_finish_svc_election() void Monitor::win_election(epoch_t epoch, set& active, uint64_t features, const mon_feature_t& mon_features, + const map& metadata, const MonCommand *cmdset, int cmdsize) { dout(10) << __func__ << " epoch " << epoch << " quorum " << active @@ -1917,6 +1922,7 @@ void Monitor::win_election(epoch_t epoch, set& active, uint64_t features, quorum = active; quorum_con_features = features; quorum_mon_features = mon_features; + pending_metadata = metadata; outside_quorum.clear(); clog->info() << "mon." << name << "@" << rank @@ -1936,6 +1942,27 @@ void Monitor::win_election(epoch_t epoch, set& active, uint64_t features, logger->inc(l_mon_election_win); + // inject new metadata in first transaction. + { + // include previous metadata for missing mons (that aren't part of + // the current quorum). + map m = metadata; + for (unsigned rank = 0; rank < monmap->size(); ++rank) { + if (m.count(rank) == 0 && + mon_metadata.count(rank)) { + m[rank] = mon_metadata[rank]; + } + } + + // FIXME: This is a bit sloppy because we aren't guaranteed to submit + // a new transaction immediately after the election finishes. We should + // do that anyway for other reasons, though. 
+ MonitorDBStore::TransactionRef t = paxos->get_pending_transaction(); + bufferlist bl; + ::encode(m, bl); + t->put(MONITOR_STORE_PREFIX, "last_metadata", bl); + } + finish_election(); if (monmap->size() > 1 && monmap->get_epoch() > 0) { @@ -1944,10 +1971,6 @@ void Monitor::win_election(epoch_t epoch, set& active, uint64_t features, do_health_to_clog_interval(); scrub_event_start(); } - - Metadata my_meta; - collect_metadata(&my_meta); - update_mon_metadata(rank, std::move(my_meta)); } void Monitor::lose_election(epoch_t epoch, set &q, int l, @@ -1974,7 +1997,9 @@ void Monitor::lose_election(epoch_t epoch, set &q, int l, finish_election(); - if (quorum_con_features & CEPH_FEATURE_MON_METADATA) { + if ((quorum_con_features & CEPH_FEATURE_MON_METADATA) && + !HAVE_FEATURE(quorum_con_features, SERVER_LUMINOUS)) { + // for pre-luminous mons only Metadata sys_info; collect_metadata(&sys_info); messenger->send_message(new MMonMetadata(sys_info), @@ -4713,6 +4738,7 @@ void Monitor::handle_mon_metadata(MonOpRequestRef op) void Monitor::update_mon_metadata(int from, Metadata&& m) { + // NOTE: this is now for legacy (kraken or jewel) mons only. pending_metadata[from] = std::move(m); MonitorDBStore::TransactionRef t = paxos->get_pending_transaction(); diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index b5f87708ed1..fdc8fe6cc45 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -597,6 +597,7 @@ public: void win_election(epoch_t epoch, set& q, uint64_t features, const mon_feature_t& mon_features, + const map& metadata, const MonCommand *cmdset, int cmdsize); void lose_election(epoch_t epoch, set& q, int l, uint64_t features, -- 2.39.5