]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: use GMT time for the object name of hitsets
authorKefu Chai <kchai@redhat.com>
Fri, 5 Jun 2015 13:06:48 +0000 (21:06 +0800)
committerKefu Chai <kchai@redhat.com>
Fri, 9 Oct 2015 10:20:31 +0000 (03:20 -0700)
* bump the encoding version of pg_hit_set_info_t to 2, so we can
  tell if the corresponding hit_set is named using localtime or
  GMT
* bump the encoding version of pg_pool_t to 20, so we can tell
  whether a pool is using GMT to name the hit_set archive, and
  whether the current cluster allows OSDs that do not support
  GMT mode.
* add an option named `osd_pool_use_gmt_hitset`. if enabled,
  the cluster will try to use GMT mode when creating a new pool
  if all the up OSDs support GMT mode. if any of the
  pools in the cluster is using GMT mode, then only OSDs
  supporting GMT mode are allowed to join the cluster.

Fixes: #9732
Signed-off-by: Kefu Chai <kchai@redhat.com>
(cherry picked from commit 42f8c5daad16aa849a0b99871d50161673c0c370)

 Conflicts:
src/include/ceph_features.h
src/osd/ReplicatedPG.cc
src/osd/osd_types.cc
src/osd/osd_types.h
fill pg_pool_t with default settings in master branch.

src/common/config_opts.h
src/include/ceph_features.h
src/mon/OSDMonitor.cc
src/osd/ReplicatedPG.cc
src/osd/ReplicatedPG.h
src/osd/osd_types.cc
src/osd/osd_types.h

index 92b02d3f698976207b791e88808806cc9451d4b2..c6ca69b4a358cdea3665b0a1b6ce4a6a518c2e72 100644 (file)
@@ -497,6 +497,7 @@ OPTION(osd_client_message_cap, OPT_U64, 100)              // num client messages
 OPTION(osd_pg_bits, OPT_INT, 6)  // bits per osd
 OPTION(osd_pgp_bits, OPT_INT, 6)  // bits per osd
 OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host
+OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it.
 OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
 OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
 OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes
index 781df1b3003b1d15c6969de158448ff891b6f382..a5d9fc1f16f7287be4880e88daf7710932f21640 100644 (file)
@@ -64,6 +64,7 @@
 // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
 #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49)  /* overlap w/ above */
 #define CEPH_FEATURE_MON_METADATA (1ULL<<50)
+#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<51) /* overlap with bitwise sort */
 /* ... */
 #define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
 
index cdbb6c752e9e2eafac585fc6c394602fe4ed2072..4b9fddaab5d39584e7198c89c634ae9c571ff80f 100644 (file)
@@ -16,6 +16,7 @@
  * 
  */
 
+#include <algorithm>
 #include <sstream>
 
 #include "OSDMonitor.h"
@@ -1572,6 +1573,9 @@ void OSDMonitor::take_all_failures(list<MOSDFailure*>& ls)
   failure_info.clear();
 }
 
+static bool uses_gmt_hitset(const std::pair<int64_t, pg_pool_t>& pool) {
+  return pool.second.use_gmt_hitset;
+}
 
 // boot --
 
@@ -1641,6 +1645,19 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
     }
   }
 
+  if (std::find_if(osdmap.get_pools().begin(),
+                  osdmap.get_pools().end(),
+                  uses_gmt_hitset) != osdmap.get_pools().end()) {
+    assert(osdmap.get_num_up_osds() == 0 ||
+          osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
+    if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
+      dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
+             << m->get_orig_source_inst()
+             << " doesn't announce support -- ignore" << dendl;
+      goto ignore;
+    }
+  }
+
   // already booted?
   if (osdmap.is_up(from) &&
       osdmap.get_inst(from) == m->get_orig_source_inst()) {
@@ -4075,7 +4092,9 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
     pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
   if (g_conf->osd_pool_default_flag_nosizechange)
     pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
-
+  if (g_conf->osd_pool_use_gmt_hitset &&
+      (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
+    pi->use_gmt_hitset = true;
   pi->size = size;
   pi->min_size = min_size;
   pi->crush_ruleset = crush_ruleset;
index e1d0acf50892893e6ae3f56fb8b97b5e7d685993..ca3d55e63032841230e703d8d143332bf68fffda 100644 (file)
@@ -1135,7 +1135,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
-             oid = get_hit_set_archive_object(p->begin, p->end);
+             oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
@@ -10110,10 +10110,19 @@ hobject_t ReplicatedPG::get_hit_set_current_object(utime_t stamp)
   return hoid;
 }
 
-hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end)
+hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start,
+                                                  utime_t end,
+                                                  bool using_gmt)
 {
   ostringstream ss;
-  ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end;
+  ss << "hit_set_" << info.pgid.pgid << "_archive_";
+  if (using_gmt) {
+    start.gmtime(ss) << "_";
+    end.gmtime(ss);
+  } else {
+    start.localtime(ss) << "_";
+    end.localtime(ss);
+  }
   hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
                 info.pgid.ps(), info.pgid.pool(),
                 cct->_conf->osd_hit_set_namespace);
@@ -10250,7 +10259,7 @@ void ReplicatedPG::hit_set_persist()
   for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
        p != info.hit_set.history.end();
        ++p) {
-    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end);
+    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
 
     // Once we hit a degraded object just skip further trim
     if (is_degraded_or_backfilling_object(aoid))
@@ -10259,10 +10268,8 @@ void ReplicatedPG::hit_set_persist()
       return;
   }
 
-  oid = get_hit_set_archive_object(start, now);
+  oid = get_hit_set_archive_object(start, now, pool.info.use_gmt_hitset);
   // If the current object is degraded we skip this persist request
-  if (is_degraded_or_backfilling_object(oid))
-    return;
   if (scrubber.write_blocked_by_scrub(oid))
     return;
 
@@ -10353,7 +10360,7 @@ void ReplicatedPG::hit_set_persist()
 
   updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info);
   hit_set_create();
-  updated_hit_set_hist.current_info = pg_hit_set_info_t();
+  updated_hit_set_hist.current_info = pg_hit_set_info_t(pool.info.use_gmt_hitset);
   updated_hit_set_hist.current_last_stamp = utime_t();
 
   // fabricate an object_info_t and SnapSet
@@ -10416,7 +10423,7 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
   for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
     list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
     assert(p != updated_hit_set_hist.history.end());
-    hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+    hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
 
     assert(!is_degraded_or_backfilling_object(oid));
 
@@ -10701,7 +10708,7 @@ void ReplicatedPG::agent_load_hit_sets()
          continue;
        }
 
-       hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+       hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
        if (is_unreadable_object(oid)) {
          dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
          break;
index 48e0def334ef8893ec32caddc125f796e384f0f7..d09dba32a672a141f0345f4fc7f44c2556b16a52 100644 (file)
@@ -903,7 +903,9 @@ protected:
   void hit_set_in_memory_trim();                     ///< discard old in memory HitSets
 
   hobject_t get_hit_set_current_object(utime_t stamp);
-  hobject_t get_hit_set_archive_object(utime_t start, utime_t end);
+  hobject_t get_hit_set_archive_object(utime_t start,
+                                      utime_t end,
+                                      bool using_gmt);
 
   // agent
   boost::scoped_ptr<TierAgentState> agent_state;
index b2bea5b7204e4b93c48256c2039e823e0333bb4b..7253160014d301b7c309c9420e331da699f438b9 100644 (file)
@@ -926,6 +926,7 @@ void pg_pool_t::dump(Formatter *f) const
   f->close_section(); // hit_set_params
   f->dump_unsigned("hit_set_period", hit_set_period);
   f->dump_unsigned("hit_set_count", hit_set_count);
+  f->dump_bool("use_gmt_hitset", use_gmt_hitset);
   f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
   f->dump_unsigned("stripe_width", get_stripe_width());
   f->dump_unsigned("expected_num_objects", expected_num_objects);
@@ -1280,6 +1281,9 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
   ::encode(last_force_op_resend, bl);
   ::encode(min_read_recency_for_promote, bl);
   ::encode(expected_num_objects, bl);
+  ::encode(uint32_t(.6 * 1e6), bl);
+  ::encode(uint32_t(1), bl);
+  ::encode(use_gmt_hitset, bl);
   ENCODE_FINISH(bl);
 }
 
@@ -1397,6 +1401,17 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
   } else {
     expected_num_objects = 0;
   }
+  if (struct_v >= 19) {
+    uint32_t dummy;        
+    ::decode(dummy, bl);
+  }
+  if (struct_v >= 20) {
+    uint32_t dummy;
+    ::decode(dummy, bl);
+    ::decode(use_gmt_hitset, bl);
+  } else {
+    use_gmt_hitset = false;
+  }
   DECODE_FINISH(bl);
   calc_pg_masks();
 }
@@ -3789,19 +3804,25 @@ void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
 
 void pg_hit_set_info_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(1, 1, bl);
+  ENCODE_START(2, 1, bl);
   ::encode(begin, bl);
   ::encode(end, bl);
   ::encode(version, bl);
+  ::encode(using_gmt, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_hit_set_info_t::decode(bufferlist::iterator& p)
 {
-  DECODE_START(1, p);
+  DECODE_START(2, p);
   ::decode(begin, p);
   ::decode(end, p);
   ::decode(version, p);
+  if (struct_v >= 2) {
+    ::decode(using_gmt, p);
+  } else {
+    using_gmt = false;
+  }
   DECODE_FINISH(p);
 }
 
@@ -3810,6 +3831,7 @@ void pg_hit_set_info_t::dump(Formatter *f) const
   f->dump_stream("begin") << begin;
   f->dump_stream("end") << end;
   f->dump_stream("version") << version;
+  f->dump_stream("using_gmt") << using_gmt;
 }
 
 void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
index b9b3b8150ab97759219199875072505dacb0defe..ed6bbfc24b7db94871463fd83d2bcbeca93c5c3c 100644 (file)
@@ -1035,6 +1035,7 @@ public:
   HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
   uint32_t hit_set_period;      ///< periodicity of HitSet segments (seconds)
   uint32_t hit_set_count;       ///< number of periods to retain
+  bool use_gmt_hitset;         ///< use gmt to name the hitset archive object
   uint32_t min_read_recency_for_promote;   ///< minimum number of HitSet to check before promote
 
   uint32_t stripe_width;        ///< erasure coded stripe size in bytes
@@ -1063,6 +1064,7 @@ public:
       hit_set_params(),
       hit_set_period(0),
       hit_set_count(0),
+      use_gmt_hitset(true),
       min_read_recency_for_promote(0),
       stripe_width(0),
       expected_num_objects(0)
@@ -1600,10 +1602,11 @@ WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
 struct pg_hit_set_info_t {
   utime_t begin, end;   ///< time interval
   eversion_t version;   ///< version this HitSet object was written
-
-  pg_hit_set_info_t() {}
-  pg_hit_set_info_t(utime_t b)
-    : begin(b) {}
+  bool using_gmt;      ///< use gmt for creating the hit_set archive object name
+  pg_hit_set_info_t(bool using_gmt = true)
+    : using_gmt(using_gmt) {}
+  pg_hit_set_info_t(utime_t b, bool using_gmt)
+    : begin(b), using_gmt(using_gmt) {}
 
   void encode(bufferlist &bl) const;
   void decode(bufferlist::iterator &bl);