]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mon/OSDMonitor: track history and past_intervals for creating pgs
authorSage Weil <sage@redhat.com>
Fri, 19 Apr 2019 21:33:04 +0000 (16:33 -0500)
committerSage Weil <sage@redhat.com>
Mon, 22 Apr 2019 14:33:05 +0000 (09:33 -0500)
PG create messages from mons are the last case where the OSD may have to
scan an unbounded number of old maps in order to construct a valid
pg_history_t and PastIntervals.  Try to avoid making that a difficult
case by maintaining those structures on the monitor.

It is still possible that the mon may send a pg create message to the OSD
and it sits in a message queue for a very long time, but this would be a
very difficult situation to get into, and is no different from inter-OSD
messages that include history and past_intervals.

Signed-off-by: Sage Weil <sage@redhat.com>
src/messages/MOSDPGCreate2.h
src/mon/CreatingPGs.h
src/mon/OSDMonitor.cc
src/tools/ceph-dencoder/types.h
src/tools/ceph_monstore_tool.cc

index e2c6800e3b7f2137f4e0f8592450e11b37c29b0d..a1413727c969b9e684e13cb748758d08b83e22c0 100644 (file)
 
 class MOSDPGCreate2 : public Message {
 public:
-  static constexpr int HEAD_VERSION = 1;
+  static constexpr int HEAD_VERSION = 2;
   static constexpr int COMPAT_VERSION = 1;
 
   epoch_t epoch = 0;
   map<spg_t,pair<epoch_t,utime_t>> pgs;
+  map<spg_t,pair<pg_history_t,PastIntervals>> pg_extra;
 
   MOSDPGCreate2()
     : Message{MSG_OSD_PG_CREATE2, HEAD_VERSION, COMPAT_VERSION} {}
@@ -38,12 +39,16 @@ public:
     using ceph::encode;
     encode(epoch, payload);
     encode(pgs, payload);
+    encode(pg_extra, payload);
   }
   void decode_payload() override {
     auto p = payload.cbegin();
     using ceph::decode;
     decode(epoch, p);
     decode(pgs, p);
+    if (header.version >= 2) {
+      decode(pg_extra, p);
+    }
   }
 private:
   template<class T, typename... Args>
index 396ccf1cac53d5f24413ed4ab13d363723c2b996..f46143c1d5ee84f7d3732d777b2c7693b1b0db4a 100644 (file)
 struct creating_pgs_t {
   epoch_t last_scan_epoch = 0;
 
+  struct pg_create_info {
+    epoch_t create_epoch;
+    utime_t create_stamp;
+
+    // NOTE: pre-octopus instances of this class will have a
+    // zeroed-out history
+    vector<int> up;
+    int up_primary = -1;
+    vector<int> acting;
+    int acting_primary = -1;
+    pg_history_t history;
+    PastIntervals past_intervals;
+
+    void encode(bufferlist& bl, uint64_t features) const {
+      using ceph::encode;
+      if (!HAVE_FEATURE(features, SERVER_OCTOPUS)) {
+       // was pair<epoch_t,utime_t> prior to octopus
+       encode(create_epoch, bl);
+       encode(create_stamp, bl);
+       return;
+      }
+      ENCODE_START(1, 1, bl);
+      encode(create_epoch, bl);
+      encode(create_stamp, bl);
+      encode(up, bl);
+      encode(up_primary, bl);
+      encode(acting, bl);
+      encode(acting_primary, bl);
+      encode(history, bl);
+      encode(past_intervals, bl);
+      ENCODE_FINISH(bl);
+    }
+    void decode_legacy(bufferlist::const_iterator& p) {
+      using ceph::decode;
+      decode(create_epoch, p);
+      decode(create_stamp, p);
+    }
+    void decode(bufferlist::const_iterator& p) {
+      using ceph::decode;
+      DECODE_START(1, p);
+      decode(create_epoch, p);
+      decode(create_stamp, p);
+      decode(up, p);
+      decode(up_primary, p);
+      decode(acting, p);
+      decode(acting_primary, p);
+      decode(history, p);
+      decode(past_intervals, p);
+      DECODE_FINISH(p);
+    }
+    void dump(Formatter *f) const {
+      f->dump_unsigned("create_epoch", create_epoch);
+      f->dump_stream("create_stamp") << create_stamp;
+      f->open_array_section("up");
+      for (auto& i : up) {
+       f->dump_unsigned("osd", i);
+      }
+      f->close_section();
+      f->dump_int("up_primary", up_primary);
+      f->open_array_section("acting");
+      for (auto& i : acting) {
+       f->dump_unsigned("osd", i);
+      }
+      f->close_section();
+      f->dump_int("acting_primary", up_primary);
+      f->dump_object("pg_history", history);
+      f->dump_object("past_intervals", past_intervals);
+    }
+
+    pg_create_info() {}
+    pg_create_info(epoch_t e, utime_t t)
+      : create_epoch(e),
+       create_stamp(t) {
+      // NOTE: we don't initialize the other fields here; see
+      // OSDMonitor::update_pending_pgs()
+    }
+  };
+
   /// pgs we are currently creating
-  std::map<pg_t, std::pair<epoch_t, utime_t> > pgs;
+  std::map<pg_t, pg_create_info> pgs;
 
-  struct create_info {
+  struct pool_create_info {
     epoch_t created;
     utime_t modified;
     uint64_t start = 0;
@@ -42,7 +120,7 @@ struct creating_pgs_t {
   };
 
   /// queue of pgs we still need to create (poolid -> <created, set of ps>)
-  map<int64_t,create_info> queue;
+  map<int64_t,pool_create_info> queue;
 
   /// pools that exist in the osdmap for which at least one pg has been created
   std::set<int64_t> created_pools;
@@ -76,18 +154,34 @@ struct creating_pgs_t {
     queue.erase(removed_pool);
     return total - pgs.size();
   }
-  void encode(bufferlist& bl) const {
-    ENCODE_START(2, 1, bl);
+  void encode(bufferlist& bl, uint64_t features) const {
+    unsigned v = 3;
+    if (!HAVE_FEATURE(features, SERVER_OCTOPUS)) {
+      v = 2;
+    }
+    ENCODE_START(v, 1, bl);
     encode(last_scan_epoch, bl);
-    encode(pgs, bl);
+    encode(pgs, bl, features);
     encode(created_pools, bl);
     encode(queue, bl);
     ENCODE_FINISH(bl);
   }
   void decode(bufferlist::const_iterator& bl) {
-    DECODE_START(2, bl);
+    DECODE_START(3, bl);
     decode(last_scan_epoch, bl);
-    decode(pgs, bl);
+    if (struct_v >= 3) {
+      decode(pgs, bl);
+    } else {
+      // legacy pg encoding
+      pgs.clear();
+      uint32_t num;
+      decode(num, bl);
+      while (num--) {
+       pg_t pgid;
+       decode(pgid, bl);
+       pgs[pgid].decode_legacy(bl);
+      }
+    }
     decode(created_pools, bl);
     if (struct_v >= 2)
       decode(queue, bl);
@@ -99,8 +193,7 @@ struct creating_pgs_t {
     for (auto& pg : pgs) {
       f->open_object_section("pg");
       f->dump_stream("pgid") << pg.first;
-      f->dump_unsigned("epoch", pg.second.first);
-      f->dump_stream("ctime") << pg.second.second;
+      f->dump_object("pg_create_info", pg.second);
       f->close_section();
     }
     f->close_section();
@@ -124,16 +217,17 @@ struct creating_pgs_t {
   static void generate_test_instances(list<creating_pgs_t*>& o) {
     auto c = new creating_pgs_t;
     c->last_scan_epoch = 17;
-    c->pgs.emplace(pg_t{42, 2}, make_pair(31, utime_t{891, 113}));
-    c->pgs.emplace(pg_t{44, 2}, make_pair(31, utime_t{891, 113}));
+    c->pgs.emplace(pg_t{42, 2}, pg_create_info(31, utime_t{891, 113}));
+    c->pgs.emplace(pg_t{44, 2}, pg_create_info(31, utime_t{891, 113}));
     c->created_pools = {0, 1};
     o.push_back(c);
     c = new creating_pgs_t;
     c->last_scan_epoch = 18;
-    c->pgs.emplace(pg_t{42, 3}, make_pair(31, utime_t{891, 113}));
+    c->pgs.emplace(pg_t{42, 3}, pg_create_info(31, utime_t{891, 113}));
     c->created_pools = {};
     o.push_back(c);
   }
 };
-WRITE_CLASS_ENCODER(creating_pgs_t::create_info);
-WRITE_CLASS_ENCODER(creating_pgs_t);
+WRITE_CLASS_ENCODER_FEATURES(creating_pgs_t::pg_create_info)
+WRITE_CLASS_ENCODER(creating_pgs_t::pool_create_info)
+WRITE_CLASS_ENCODER_FEATURES(creating_pgs_t)
index 08bffcca6403cf16749f970dee9e277dc3e5caa5..74c1b66ec32b8916a3d4a6c3cfa8e0fc955ca2af 100644 (file)
@@ -843,8 +843,10 @@ OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
       const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
       // NOTE: use the *current* epoch as the PG creation epoch so that the
       // OSD does not have to generate a long set of PastIntervals.
-      pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
-                                                   p->second.modified));
+      pending_creatings.pgs.emplace(
+       pgid,
+       creating_pgs_t::pg_create_info(inc.epoch,
+                                      p->second.modified));
       dout(10) << __func__ << " adding " << pgid << dendl;
     }
     p->second.start = end;
@@ -859,6 +861,89 @@ OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
   }
   dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;
+
+  if (mon->monmap->min_mon_release >= CEPH_RELEASE_OCTOPUS) {
+    // walk creating pgs' history and past_intervals forward
+    for (auto& i : pending_creatings.pgs) {
+      // this mirrors PG::start_peering_interval()
+      pg_t pgid = i.first;
+
+      // this is a bit imprecise, but sufficient?
+      struct min_size_predicate_t : public IsPGRecoverablePredicate {
+       const pg_pool_t *pi;
+       bool operator()(const set<pg_shard_t> &have) const {
+         return have.size() >= pi->min_size;
+       }
+       explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
+      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));
+
+      vector<int> up, acting;
+      int up_primary, acting_primary;
+      nextmap.pg_to_up_acting_osds(
+       pgid, &up, &up_primary, &acting, &acting_primary);
+      if (i.second.history.epoch_created == 0) {
+       // new pg entry, set it up
+       i.second.up = up;
+       i.second.acting = acting;
+       i.second.up_primary = up_primary;
+       i.second.acting_primary = acting_primary;
+       i.second.history = pg_history_t(i.second.create_epoch,
+                                       i.second.create_stamp);
+       dout(10) << __func__ << "  pg " << pgid << " just added, "
+                << " up " << i.second.up
+                << " p " << i.second.up_primary
+                << " acting " << i.second.acting
+                << " p " << i.second.acting_primary
+                << " history " << i.second.history
+                << " past_intervals " << i.second.past_intervals
+                << dendl;
+     } else {
+       std::stringstream debug;
+       if (PastIntervals::check_new_interval(
+             i.second.acting_primary, acting_primary,
+             i.second.acting, acting,
+             i.second.up_primary, up_primary,
+             i.second.up, up,
+             i.second.history.same_interval_since,
+             i.second.history.last_epoch_clean,
+             &nextmap,
+             &osdmap,
+             pgid,
+             &min_size_predicate,
+             &i.second.past_intervals,
+             &debug)) {
+         epoch_t e = inc.epoch;
+         i.second.history.same_interval_since = e;
+         if (i.second.up != up) {
+           i.second.history.same_up_since = e;
+         }
+         if (i.second.acting_primary != acting_primary) {
+           i.second.history.same_primary_since = e;
+         }
+         if (pgid.is_split(
+               osdmap.get_pg_num(pgid.pool()),
+               nextmap.get_pg_num(pgid.pool()),
+               nullptr)) {
+           i.second.history.last_epoch_split = e;
+         }
+         dout(10) << __func__ << "  pg " << pgid << " new interval,"
+                  << " up " << i.second.up << " -> " << up
+                  << " p " << i.second.up_primary << " -> " << up_primary
+                  << " acting " << i.second.acting << " -> " << acting
+                  << " p " << i.second.acting_primary << " -> "
+                  << acting_primary
+                  << " history " << i.second.history
+                  << " past_intervals " << i.second.past_intervals
+                  << dendl;
+         dout(20) << "  debug: " << debug.str() << dendl;
+         i.second.up = up;
+         i.second.acting = acting;
+         i.second.up_primary = up_primary;
+         i.second.acting_primary = acting_primary;
+       }
+      }
+    }
+  }
   dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
@@ -1102,7 +1187,13 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
     // process the pool flag removal below in the same osdmap epoch.
     auto pending_creatings = update_pending_pgs(pending_inc, tmp);
     bufferlist creatings_bl;
-    encode(pending_creatings, creatings_bl);
+    uint64_t features = CEPH_FEATURES_ALL;
+    if (mon->monmap->min_mon_release < CEPH_RELEASE_OCTOPUS) {
+      dout(20) << __func__ << " encoding pending pgs without octopus features"
+              << dendl;
+      features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
+    }
+    encode(pending_creatings, creatings_bl, features);
     t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
 
     // remove any old (or incompat) POOL_CREATING flags
@@ -4240,7 +4331,7 @@ void OSDMonitor::update_creating_pgs()
               << dendl;
       continue;
     }
-    auto mapped = pg.second.first;
+    auto mapped = pg.second.create_epoch;
     dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
     spg_t spgid(pgid);
     mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
@@ -4313,16 +4404,23 @@ epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) cons
          oldm = new MOSDPGCreate(creating_pgs_epoch);
        }
        oldm->mkpg.emplace(pg.pgid,
-                          pg_create_t{create->second.first, pg.pgid, 0});
-       oldm->ctimes.emplace(pg.pgid, create->second.second);
+                          pg_create_t{create->second.create_epoch, pg.pgid, 0});
+       oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
       } else {
        if (!m) {
          m = new MOSDPGCreate2(creating_pgs_epoch);
        }
-       m->pgs.emplace(pg, create->second);
+       m->pgs.emplace(pg, make_pair(create->second.create_epoch,
+                                    create->second.create_stamp));
+       if (create->second.history.epoch_created) {
+         dout(20) << __func__ << "   " << pg << " " << create->second.history
+                  << " " << create->second.past_intervals << dendl;
+         m->pg_extra.emplace(pg, make_pair(create->second.history,
+                                           create->second.past_intervals));
+       }
       }
       dout(20) << __func__ << " will create " << pg
-              << " at " << create->second.first << dendl;
+              << " at " << create->second.create_epoch << dendl;
     }
   }
   if (m) {
@@ -12597,9 +12695,10 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     bool creating_now;
     {
       std::lock_guard<std::mutex> l(creating_pgs_lock);
-      auto emplaced = creating_pgs.pgs.emplace(pgid,
-                                              make_pair(osdmap.get_epoch(),
-                                                        ceph_clock_now()));
+      auto emplaced = creating_pgs.pgs.emplace(
+       pgid,
+       creating_pgs_t::pg_create_info(osdmap.get_epoch(),
+                                      ceph_clock_now()));
       creating_now = emplaced.second;
     }
     if (creating_now) {
index 4f3206d4539c9b378f390fc897f7f663cce7c45c..d6707394b58ba6014b06b65fbd5883625872e3ce 100644 (file)
@@ -188,7 +188,7 @@ TYPE(LevelDBStoreStats)
 TYPE(ScrubResult)
 
 #include "mon/CreatingPGs.h"
-TYPE(creating_pgs_t)
+TYPE_FEATUREFUL(creating_pgs_t)
 
 #include "mgr/ServiceMap.h"
 TYPE_FEATUREFUL(ServiceMap)
index 80c3aa7538f04c921388d7f2198c676695a725a3..030f8b437a90f57608c3db711fc0400070db2f68 100644 (file)
@@ -572,7 +572,7 @@ static int update_creating_pgs(MonitorDBStore& st)
   creating.last_scan_epoch = last_osdmap_epoch;
 
   bufferlist newbl;
-  ::encode(creating, newbl);
+  encode(creating, newbl, CEPH_FEATURES_ALL);
 
   auto t = make_shared<MonitorDBStore::Transaction>();
   t->put("osd_pg_creating", "creating", newbl);