]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mon: collect mon metadata as part of election 16148/head
author: Sage Weil <sage@redhat.com>
Thu, 6 Jul 2017 19:32:20 +0000 (15:32 -0400)
committer: Sage Weil <sage@redhat.com>
Fri, 7 Jul 2017 15:34:47 +0000 (11:34 -0400)
Previously each peon would send a message to the leader with its metadata
immediately after the election was won.  However, at that point paxos
usually wasn't writeable, which meant the old update_mon_metadata() method
didn't persist reliably, updates would race, and generally speaking
metadata wasn't reliably updated.

Fix this by including metadata as part of the election ack, and persisting
the whole quorum when the election is won.  This ensures it is up to date.

Fixes: http://tracker.ceph.com/issues/20434
Signed-off-by: Sage Weil <sage@redhat.com>
src/messages/MMonElection.h
src/mon/Elector.cc
src/mon/Elector.h
src/mon/Monitor.cc
src/mon/Monitor.h

index 79503875e264490f3b08152b0ba7d5a1dff43d38..c9b87c451ecd9320e9f8a0e2cd20a72e22ff0772 100644 (file)
@@ -22,7 +22,7 @@
 
 class MMonElection : public Message {
 
-  static const int HEAD_VERSION = 6;
+  static const int HEAD_VERSION = 7;
   static const int COMPAT_VERSION = 5;
 
 public:
@@ -48,25 +48,19 @@ public:
   uint64_t quorum_features;
   mon_feature_t mon_features;
   bufferlist sharing_bl;
-  /* the following were both used in the next branch for a while
-   * on user cluster, so we've left them in for compatibility. */
-  version_t defunct_one;
-  version_t defunct_two;
+  map<string,string> metadata;
   
   MMonElection() : Message(MSG_MON_ELECTION, HEAD_VERSION, COMPAT_VERSION),
     op(0), epoch(0),
     quorum_features(0),
-    mon_features(0),
-    defunct_one(0),
-    defunct_two(0)
+    mon_features(0)
   { }
 
   MMonElection(int o, epoch_t e, MonMap *m)
     : Message(MSG_MON_ELECTION, HEAD_VERSION, COMPAT_VERSION),
       fsid(m->fsid), op(o), epoch(e),
       quorum_features(0),
-      mon_features(0),
-      defunct_one(0), defunct_two(0)
+      mon_features(0)
   {
     // encode using full feature set; we will reencode for dest later,
     // if necessary
@@ -96,10 +90,11 @@ public:
     ::encode(monmap_bl, payload);
     ::encode(quorum, payload);
     ::encode(quorum_features, payload);
-    ::encode(defunct_one, payload);
-    ::encode(defunct_two, payload);
+    ::encode((version_t)0, payload);  // defunct
+    ::encode((version_t)0, payload);  // defunct
     ::encode(sharing_bl, payload);
     ::encode(mon_features, payload);
+    ::encode(metadata, payload);
   }
   void decode_payload() override {
     bufferlist::iterator p = payload.begin();
@@ -109,11 +104,16 @@ public:
     ::decode(monmap_bl, p);
     ::decode(quorum, p);
     ::decode(quorum_features, p);
-    ::decode(defunct_one, p);
-    ::decode(defunct_two, p);
+    {
+      version_t v;  // defunct fields from old encoding
+      ::decode(v, p);
+      ::decode(v, p);
+    }
     ::decode(sharing_bl, p);
     if (header.version >= 6)
       ::decode(mon_features, p);
+    if (header.version >= 7)
+      ::decode(metadata, p);
   }
   
 };
index 303510530b5c223709a69decb79c1dcf9f462d7b..a2244a3c6da04895d354c376b08e0b107de305a3 100644 (file)
@@ -87,6 +87,7 @@ void Elector::start()
   electing_me = true;
   acked_me[mon->rank].cluster_features = CEPH_FEATURES_ALL;
   acked_me[mon->rank].mon_features = ceph::features::mon::get_supported();
+  mon->collect_metadata(&acked_me[mon->rank].metadata);
   leader_acked = -1;
 
   // bcast to everyone else
@@ -117,6 +118,7 @@ void Elector::defer(int who)
   MMonElection *m = new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap);
   m->mon_features = ceph::features::mon::get_supported();
   m->sharing_bl = mon->get_supported_commands_bl();
+  mon->collect_metadata(&m->metadata);
   mon->messenger->send_message(m, mon->monmap->get_inst(who));
   
   // set a timer
@@ -184,12 +186,14 @@ void Elector::victory()
   uint64_t cluster_features = CEPH_FEATURES_ALL;
   mon_feature_t mon_features = ceph::features::mon::get_supported();
   set<int> quorum;
-  for (map<int, elector_features_t>::iterator p = acked_me.begin();
+  map<int,Metadata> metadata;
+  for (map<int, elector_info_t>::iterator p = acked_me.begin();
        p != acked_me.end();
        ++p) {
     quorum.insert(p->first);
     cluster_features &= p->second.cluster_features;
     mon_features &= p->second.mon_features;
+    metadata[p->first] = p->second.metadata;
   }
 
   cancel_timer();
@@ -216,10 +220,10 @@ void Elector::victory()
     m->sharing_bl = *cmds_bl;
     mon->messenger->send_message(m, mon->monmap->get_inst(*p));
   }
-    
+
   // tell monitor
   mon->win_election(epoch, quorum,
-                    cluster_features, mon_features,
+                    cluster_features, mon_features, metadata,
                     cmds, cmdsize);
 }
 
@@ -331,8 +335,9 @@ void Elector::handle_ack(MonOpRequestRef op)
     // thanks
     acked_me[from].cluster_features = m->get_connection()->get_features();
     acked_me[from].mon_features = m->mon_features;
+    acked_me[from].metadata = m->metadata;
     dout(5) << " so far i have {";
-    for (map<int, elector_features_t>::const_iterator p = acked_me.begin();
+    for (map<int, elector_info_t>::const_iterator p = acked_me.begin();
          p != acked_me.end();
          ++p) {
       if (p != acked_me.begin())
index 2e407d29058258c92b3223a62ed40257b99275e3..b9e6310b5b881e0103a0669fefc27d702a1b7e8d 100644 (file)
@@ -47,9 +47,10 @@ class Elector {
    * mon-specific features. Instead of keeping maps to hold them both, or
    * a pair, which would be weird, a struct to keep them seems appropriate.
    */
-  struct elector_features_t {
+  struct elector_info_t {
     uint64_t cluster_features;
     mon_feature_t mon_features;
+    map<string,string> metadata;
   };
 
   /**
@@ -130,7 +131,7 @@ class Elector {
    * If we are acked by everyone in the MonMap, we will declare
    * victory.  Also note each peer's feature set.
    */
-  map<int, elector_features_t> acked_me;
+  map<int, elector_info_t> acked_me;
   /**
    * @}
    */
index ccf0b751d33287e38706fa2e8b0cc4db2dec1d54..f70b78a7ec65a7f7bf251adecad8ac98d919a4c2 100644 (file)
@@ -1870,12 +1870,16 @@ void Monitor::win_standalone_election()
   set<int> q;
   q.insert(rank);
 
-  const MonCommand *my_cmds;
-  int cmdsize;
+  map<int,Metadata> metadata;
+  collect_metadata(&metadata[0]);
+
+  const MonCommand *my_cmds = nullptr;
+  int cmdsize = 0;
   get_locally_supported_monitor_commands(&my_cmds, &cmdsize);
   win_election(elector.get_epoch(), q,
                CEPH_FEATURES_ALL,
                ceph::features::mon::get_supported(),
+              metadata,
                my_cmds, cmdsize);
 }
 
@@ -1904,6 +1908,7 @@ void Monitor::_finish_svc_election()
 
 void Monitor::win_election(epoch_t epoch, set<int>& active, uint64_t features,
                            const mon_feature_t& mon_features,
+                          const map<int,Metadata>& metadata,
                            const MonCommand *cmdset, int cmdsize)
 {
   dout(10) << __func__ << " epoch " << epoch << " quorum " << active
@@ -1917,6 +1922,7 @@ void Monitor::win_election(epoch_t epoch, set<int>& active, uint64_t features,
   quorum = active;
   quorum_con_features = features;
   quorum_mon_features = mon_features;
+  pending_metadata = metadata;
   outside_quorum.clear();
 
   clog->info() << "mon." << name << "@" << rank
@@ -1936,6 +1942,27 @@ void Monitor::win_election(epoch_t epoch, set<int>& active, uint64_t features,
 
   logger->inc(l_mon_election_win);
 
+  // inject new metadata in first transaction.
+  {
+    // include previous metadata for missing mons (that aren't part of
+    // the current quorum).
+    map<int,Metadata> m = metadata;
+    for (unsigned rank = 0; rank < monmap->size(); ++rank) {
+      if (m.count(rank) == 0 &&
+         mon_metadata.count(rank)) {
+       m[rank] = mon_metadata[rank];
+      }
+    }
+
+    // FIXME: This is a bit sloppy because we aren't guaranteed to submit
+    // a new transaction immediately after the election finishes.  We should
+    // do that anyway for other reasons, though.
+    MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
+    bufferlist bl;
+    ::encode(m, bl);
+    t->put(MONITOR_STORE_PREFIX, "last_metadata", bl);
+  }
+
   finish_election();
   if (monmap->size() > 1 &&
       monmap->get_epoch() > 0) {
@@ -1944,10 +1971,6 @@ void Monitor::win_election(epoch_t epoch, set<int>& active, uint64_t features,
     do_health_to_clog_interval();
     scrub_event_start();
   }
-
-  Metadata my_meta;
-  collect_metadata(&my_meta);
-  update_mon_metadata(rank, std::move(my_meta));
 }
 
 void Monitor::lose_election(epoch_t epoch, set<int> &q, int l,
@@ -1974,7 +1997,9 @@ void Monitor::lose_election(epoch_t epoch, set<int> &q, int l,
 
   finish_election();
 
-  if (quorum_con_features & CEPH_FEATURE_MON_METADATA) {
+  if ((quorum_con_features & CEPH_FEATURE_MON_METADATA) &&
+      !HAVE_FEATURE(quorum_con_features, SERVER_LUMINOUS)) {
+    // for pre-luminous mons only
     Metadata sys_info;
     collect_metadata(&sys_info);
     messenger->send_message(new MMonMetadata(sys_info),
@@ -4713,6 +4738,7 @@ void Monitor::handle_mon_metadata(MonOpRequestRef op)
 
 void Monitor::update_mon_metadata(int from, Metadata&& m)
 {
+  // NOTE: this is now for legacy (kraken or jewel) mons only.
   pending_metadata[from] = std::move(m);
 
   MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
index b5f87708ed1be4d21fa53b59f3d187c7bfd170ca..fdc8fe6cc459f4b81e381596419444c1285806b9 100644 (file)
@@ -597,6 +597,7 @@ public:
   void win_election(epoch_t epoch, set<int>& q,
                    uint64_t features,
                     const mon_feature_t& mon_features,
+                   const map<int,Metadata>& metadata,
                    const MonCommand *cmdset, int cmdsize);
   void lose_election(epoch_t epoch, set<int>& q, int l,
                     uint64_t features,