]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd,mon: keep last_epoch_started along with last_epoch_clean premerge 24566/head
authorSage Weil <sage@redhat.com>
Fri, 12 Oct 2018 21:10:45 +0000 (16:10 -0500)
committerSage Weil <sage@redhat.com>
Fri, 12 Oct 2018 21:18:22 +0000 (16:18 -0500)
We need to populate both the last_epoch_clean and last_epoch_started
accurately when fabricating a pg merge target.  In the case where the
premerge PG peers, recovers (e.g., backfill), then becomes ready to
merge, the last_epoch_clean may be > the last_epoch_started, and using
the later last_epoch_clean will prevent the PG from subsequently
peering (it'll go to incomplete state instead).

Fix by keeping both values, separately.

Signed-off-by: Sage Weil <sage@redhat.com>
src/messages/MOSDPGReadyToMerge.h
src/mon/OSDMonitor.cc
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc
src/osd/PG.h
src/osd/osd_types.cc
src/osd/osd_types.h

index c07c58f4b3ba1bc8ca7de95ceb9b903d09ff139a..937eaa6723a5dcb3befb6eddb58f2b7c251b19a2 100644 (file)
@@ -7,15 +7,17 @@ class MOSDPGReadyToMerge
   : public MessageInstance<MOSDPGReadyToMerge, PaxosServiceMessage> {
 public:
   pg_t pgid;
+  epoch_t last_epoch_started = 0;
   epoch_t last_epoch_clean = 0;
   bool ready = true;
 
   MOSDPGReadyToMerge()
     : MessageInstance(MSG_OSD_PG_READY_TO_MERGE, 0)
   {}
-  MOSDPGReadyToMerge(pg_t p, epoch_t lec, bool r, epoch_t v)
+  MOSDPGReadyToMerge(pg_t p, epoch_t les, epoch_t lec, bool r, epoch_t v)
     : MessageInstance(MSG_OSD_PG_READY_TO_MERGE, v),
       pgid(p),
+      last_epoch_started(les),
       last_epoch_clean(lec),
       ready(r)
   {}
@@ -23,6 +25,7 @@ public:
     using ceph::encode;
     paxos_encode();
     encode(pgid, payload);
+    encode(last_epoch_started, payload);
     encode(last_epoch_clean, payload);
     encode(ready, payload);
   }
@@ -30,13 +33,15 @@ public:
     bufferlist::const_iterator p = payload.begin();
     paxos_decode(p);
     decode(pgid, p);
+    decode(last_epoch_started, p);
     decode(last_epoch_clean, p);
     decode(ready, p);
   }
   const char *get_type_name() const override { return "osd_pg_ready_to_merge"; }
   void print(ostream &out) const {
     out << get_type_name()
-        << "(" << pgid << " lec " << last_epoch_clean
+        << "(" << pgid
+       << " les/c " << last_epoch_started << "/" << last_epoch_clean
        << (ready ? " ready" : " NOT READY")
         << " v" << version << ")";
   }
index baf1dcaf38200256258c2dfb925a5af1a44ae7e3..d5f413ee317fa74b8514acc1abf315ccbb8ca1ac 100644 (file)
@@ -3296,7 +3296,7 @@ bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
   }
 
   if (m->ready) {
-    p.dec_pg_num(m->last_epoch_clean);
+    p.dec_pg_num(m->last_epoch_started, m->last_epoch_clean);
     p.last_change = pending_inc.epoch;
   } else {
     // back off the merge attempt!
index 1bc9acff69284586a5c088368e86842866757e36..6be8c5e93858965b550d44d35a2910b3f70520f3 100644 (file)
@@ -1703,11 +1703,15 @@ void OSDService::set_ready_to_merge_source(PG *pg)
   _send_ready_to_merge();
 }
 
-void OSDService::set_ready_to_merge_target(PG *pg, epoch_t last_epoch_clean)
+void OSDService::set_ready_to_merge_target(PG *pg,
+                                          epoch_t last_epoch_started,
+                                          epoch_t last_epoch_clean)
 {
   Mutex::Locker l(merge_lock);
   dout(10) << __func__ << " " << pg->pg_id << dendl;
-  ready_to_merge_target.insert(make_pair(pg->pg_id.pgid, last_epoch_clean));
+  ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
+                                        make_pair(last_epoch_started,
+                                                  last_epoch_clean)));
   assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
   _send_ready_to_merge();
 }
@@ -1750,6 +1754,7 @@ void OSDService::_send_ready_to_merge()
       monc->send_mon_message(new MOSDPGReadyToMerge(
                               src,
                               0,
+                              0,
                               false,
                               osdmap->get_epoch()));
       sent_ready_to_merge_source.insert(src);
@@ -1760,6 +1765,7 @@ void OSDService::_send_ready_to_merge()
       monc->send_mon_message(new MOSDPGReadyToMerge(
                               p.second,
                               0,
+                              0,
                               false,
                               osdmap->get_epoch()));
       sent_ready_to_merge_source.insert(p.second);
@@ -1775,7 +1781,8 @@ void OSDService::_send_ready_to_merge()
        sent_ready_to_merge_source.count(src) == 0) {
       monc->send_mon_message(new MOSDPGReadyToMerge(
                               src,
-                              p->second,  // PG's last_epoch_clean
+                              p->second.first,   // PG's last_epoch_started
+                              p->second.second,  // PG's last_epoch_clean
                               true,
                               osdmap->get_epoch()));
       sent_ready_to_merge_source.insert(src);
@@ -8207,6 +8214,8 @@ bool OSD::advance_pg(
            dout(1) << __func__ << " merging " << pg->pg_id << dendl;
            pg->merge_from(
              sources, rctx, split_bits,
+             nextmap->get_pg_pool(
+               pg->pg_id.pool())->get_pg_num_dec_last_epoch_started(),
              nextmap->get_pg_pool(
                pg->pg_id.pool())->get_pg_num_dec_last_epoch_clean());
            pg->pg_slot->waiting_for_merge_epoch = 0;
index 9dfa3411c5b51ac71d0b38ed9dcb46670e094b22..067f3b65b6ef2b82d5d4302303576c331b86da8e 100644 (file)
@@ -717,13 +717,15 @@ public:
   // -- pg merge --
   Mutex merge_lock = {"OSD::merge_lock"};
   set<pg_t> ready_to_merge_source;
-  map<pg_t,epoch_t> ready_to_merge_target;  // pg -> last_epoch_clean
+  map<pg_t,pair<epoch_t,epoch_t>> ready_to_merge_target;  // pg -> (les,lec)
   set<pg_t> not_ready_to_merge_source;
   map<pg_t,pg_t> not_ready_to_merge_target;
   set<pg_t> sent_ready_to_merge_source;
 
   void set_ready_to_merge_source(PG *pg);
-  void set_ready_to_merge_target(PG *pg, epoch_t last_epoch_clean);
+  void set_ready_to_merge_target(PG *pg,
+                                epoch_t last_epoch_started,
+                                epoch_t last_epoch_clean);
   void set_not_ready_to_merge_source(pg_t source);
   void set_not_ready_to_merge_target(pg_t target, pg_t source);
   void clear_ready_to_merge(PG *pg);
index 3202abead4cb37eaf97cdd4fd499f480fe54c058..90b579a0b9f37610718b424c68812230b53872d1 100644 (file)
@@ -2358,7 +2358,9 @@ void PG::try_mark_clean()
       if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
        if (target) {
          ldout(cct, 10) << "ready to merge (target)" << dendl;
-         osd->set_ready_to_merge_target(this, info.history.last_epoch_clean);
+         osd->set_ready_to_merge_target(this,
+                                        info.history.last_epoch_started,
+                                        info.history.last_epoch_clean);
        } else {
          ldout(cct, 10) << "ready to merge (source)" << dendl;
          osd->set_ready_to_merge_source(this);
@@ -2677,6 +2679,7 @@ void PG::finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transac
 
 void PG::merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
                    unsigned split_bits,
+                   epoch_t dec_last_epoch_started,
                    epoch_t dec_last_epoch_clean)
 {
   dout(10) << __func__ << " from " << sources << " split_bits " << split_bits
@@ -2769,11 +2772,12 @@ void PG::merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
 
     // use last_epoch_clean value for last_epoch_started, though--we must be
     // conservative here to avoid breaking peering, calc_acting, etc.
-    info.history.last_epoch_started = dec_last_epoch_clean;
-    info.last_epoch_started = dec_last_epoch_clean;
+    info.history.last_epoch_started = dec_last_epoch_started;
+    info.last_epoch_started = dec_last_epoch_started;
     dout(10) << __func__
-            << " set last_epoch_{started,clean} to " << dec_last_epoch_clean
-            << " from pg_num_dec_last_epoch_clean, source pg history was "
+            << " set les/c to " << dec_last_epoch_started << "/"
+            << dec_last_epoch_clean
+            << " from pool last_dec_*, source pg history was "
             << sources.begin()->second->info.history
             << dendl;
   }
index 3aa6b180ea503ee104b6899b78db0cdcf1bd5907..eacaa452c2ac99eb01866bebffebe18001c6343d 100644 (file)
@@ -400,6 +400,7 @@ public:
   void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
   void merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
                  unsigned split_bits,
+                 epoch_t dec_last_epoch_started,
                  epoch_t dec_last_epoch_clean);
   void finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t);
 
index 443bb21e055046be924c97de59c7a6c8786810e6..79b0ab8482b7d894432bcbc995ee6a0d7de73b24 100644 (file)
@@ -1231,6 +1231,8 @@ void pg_pool_t::dump(Formatter *f) const
   f->dump_unsigned("pg_placement_num_target", get_pgp_num_target());
   f->dump_unsigned("pg_num_target", get_pg_num_target());
   f->dump_unsigned("pg_num_pending", get_pg_num_pending());
+  f->dump_unsigned("pg_num_dec_last_epoch_started",
+                  get_pg_num_dec_last_epoch_started());
   f->dump_unsigned("pg_num_dec_last_epoch_clean",
                   get_pg_num_dec_last_epoch_clean());
   f->dump_stream("last_change") << get_last_change();
@@ -1734,6 +1736,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
     encode(pg_num_target, bl);
     encode(pgp_num_target, bl);
     encode(pg_num_pending, bl);
+    encode(pg_num_dec_last_epoch_started, bl);
     encode(pg_num_dec_last_epoch_clean, bl);
     encode(last_force_op_resend, bl);
   }
@@ -1899,6 +1902,7 @@ void pg_pool_t::decode(bufferlist::const_iterator& bl)
     decode(pg_num_target, bl);
     decode(pgp_num_target, bl);
     decode(pg_num_pending, bl);
+    decode(pg_num_dec_last_epoch_started, bl);
     decode(pg_num_dec_last_epoch_clean, bl);
     decode(last_force_op_resend, bl);
   } else {
@@ -1927,7 +1931,8 @@ void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
   a.pgp_num_target = 4;
   a.pg_num_target = 5;
   a.pg_num_pending = 5;
-  a.pg_num_dec_last_epoch_clean = 2;
+  a.pg_num_dec_last_epoch_started = 2;
+  a.pg_num_dec_last_epoch_clean = 3;
   a.last_change = 9;
   a.last_force_op_resend = 123823;
   a.last_force_op_resend_preluminous = 123824;
@@ -1998,8 +2003,10 @@ ostream& operator<<(ostream& out, const pg_pool_t& p)
   }
   if (p.get_pg_num_pending() != p.get_pg_num()) {
     out << " pg_num_pending " << p.get_pg_num_pending();
-    if (p.get_pg_num_dec_last_epoch_clean())
-      out << " dlec " << p.get_pg_num_dec_last_epoch_clean();
+    if (p.get_pg_num_dec_last_epoch_started() ||
+       p.get_pg_num_dec_last_epoch_clean())
+      out << " dles/c " << p.get_pg_num_dec_last_epoch_started()
+         << "/" << p.get_pg_num_dec_last_epoch_clean();
   }
   out << " last_change " << p.get_last_change();
   if (p.get_last_force_op_resend() ||
index dd2c79aae0d141e76333360f058e1681b197c8e7..c87e3bccf46c5ea6efd020d456152bd365916e2e 100644 (file)
@@ -1361,6 +1361,8 @@ public:
   /// last epoch that forced clients to resend (pre-luminous clients only)
   epoch_t last_force_op_resend_preluminous = 0;
 
+  ///< last_epoch_started preceding pg_num decrement request
+  epoch_t pg_num_dec_last_epoch_started = 0;
   ///< last_epoch_clean preceding pg_num decrement request
   epoch_t pg_num_dec_last_epoch_clean = 0;
   snapid_t snap_seq;        ///< seq for per-pool snapshot
@@ -1618,6 +1620,9 @@ public:
   // pool size that it represents.
   unsigned get_pg_num_divisor(pg_t pgid) const;
 
+  epoch_t get_pg_num_dec_last_epoch_started() const {
+    return pg_num_dec_last_epoch_started;
+  }
   epoch_t get_pg_num_dec_last_epoch_clean() const {
     return pg_num_dec_last_epoch_clean;
   }
@@ -1643,8 +1648,10 @@ public:
   void set_pgp_num_target(int p) {
     pgp_num_target = p;
   }
-  void dec_pg_num(epoch_t last_epoch_clean) {
+  void dec_pg_num(epoch_t last_epoch_started,
+                 epoch_t last_epoch_clean) {
     --pg_num;
+    pg_num_dec_last_epoch_started = last_epoch_started;
     pg_num_dec_last_epoch_clean = last_epoch_clean;
     calc_pg_masks();
   }