git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: use GMT time for the object name of hitsets
author Kefu Chai <kchai@redhat.com>
Fri, 5 Jun 2015 13:06:48 +0000 (21:06 +0800)
committer Kefu Chai <kchai@redhat.com>
Thu, 24 Mar 2016 11:54:50 +0000 (19:54 +0800)
* bump the encoding version of pg_hit_set_info_t to 2, so we can
  tell if the corresponding hit_set is named using localtime or
  GMT
* bump the encoding version of pg_pool_t to 20, so we can know
  if a pool is using GMT to name the hit_set archive or not, and
  so we can tell whether the current cluster allows OSDs that do
  not support GMT mode.
* add an option named `osd_pool_use_gmt_hitset`. If enabled,
  the cluster will try to use GMT mode when creating a new pool
  if all the up OSDs support GMT mode. If any of the
  pools in the cluster is using GMT mode, then only OSDs
  supporting GMT mode are allowed to join the cluster.

Fixes: #9732
Signed-off-by: Kefu Chai <kchai@redhat.com>
(cherry picked from commit 42f8c5daad16aa849a0b99871d50161673c0c370)

 Conflicts:
src/include/ceph_features.h
src/osd/ReplicatedPG.cc
src/osd/osd_types.cc
src/osd/osd_types.h
fill pg_pool_t with default settings in master branch.

src/common/config_opts.h
src/include/ceph_features.h
src/mon/OSDMonitor.cc
src/osd/ReplicatedPG.cc
src/osd/ReplicatedPG.h
src/osd/osd_types.cc
src/osd/osd_types.h

index 76763e4ee9bde39545684088e4f1ee67a9e6bc38..99211701f6e2ca5d92e02630ec031d80d02826f8 100644 (file)
@@ -497,6 +497,7 @@ OPTION(osd_client_message_cap, OPT_U64, 100)              // num client messages
 OPTION(osd_pg_bits, OPT_INT, 6)  // bits per osd
 OPTION(osd_pgp_bits, OPT_INT, 6)  // bits per osd
 OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host
+OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it.
 OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
 OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
 OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes
index 781df1b3003b1d15c6969de158448ff891b6f382..a5d9fc1f16f7287be4880e88daf7710932f21640 100644 (file)
@@ -64,6 +64,7 @@
 // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
 #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49)  /* overlap w/ above */
 #define CEPH_FEATURE_MON_METADATA (1ULL<<50)
+#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<51) /* overlap with bitwise sort */
 /* ... */
 #define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
 
index 968efc43759836590ce2b5eb70ef34e63583c8ad..23fda28fcf3751b20f4cbf9a4f5ff6c477862cf3 100644 (file)
@@ -16,6 +16,7 @@
  * 
  */
 
+#include <algorithm>
 #include <sstream>
 
 #include "OSDMonitor.h"
@@ -1565,6 +1566,9 @@ void OSDMonitor::take_all_failures(list<MOSDFailure*>& ls)
   failure_info.clear();
 }
 
+static bool uses_gmt_hitset(const std::pair<int64_t, pg_pool_t>& pool) {
+  return pool.second.use_gmt_hitset;
+}
 
 // boot --
 
@@ -1634,6 +1638,19 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
     }
   }
 
+  if (std::find_if(osdmap.get_pools().begin(),
+                  osdmap.get_pools().end(),
+                  uses_gmt_hitset) != osdmap.get_pools().end()) {
+    assert(osdmap.get_num_up_osds() == 0 ||
+          osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
+    if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
+      dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
+             << m->get_orig_source_inst()
+             << " doesn't announce support -- ignore" << dendl;
+      goto ignore;
+    }
+  }
+
   // already booted?
   if (osdmap.is_up(from) &&
       osdmap.get_inst(from) == m->get_orig_source_inst()) {
@@ -4042,7 +4059,9 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
     pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
   if (g_conf->osd_pool_default_flag_nosizechange)
     pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
-
+  if (g_conf->osd_pool_use_gmt_hitset &&
+      (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
+    pi->use_gmt_hitset = true;
   pi->size = size;
   pi->min_size = min_size;
   pi->crush_ruleset = crush_ruleset;
index 3c6511f66bb2af116cdf66134ff085c3331e8333..f988f34ee0761ff23c3131ef90d92e5daad36655 100644 (file)
@@ -1135,7 +1135,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
-             oid = get_hit_set_archive_object(p->begin, p->end);
+             oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
@@ -10167,10 +10167,19 @@ hobject_t ReplicatedPG::get_hit_set_current_object(utime_t stamp)
   return hoid;
 }
 
-hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end)
+hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start,
+                                                  utime_t end,
+                                                  bool using_gmt)
 {
   ostringstream ss;
-  ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end;
+  ss << "hit_set_" << info.pgid.pgid << "_archive_";
+  if (using_gmt) {
+    start.gmtime(ss) << "_";
+    end.gmtime(ss);
+  } else {
+    start.localtime(ss) << "_";
+    end.localtime(ss);
+  }
   hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
                 info.pgid.ps(), info.pgid.pool(),
                 cct->_conf->osd_hit_set_namespace);
@@ -10307,7 +10316,7 @@ void ReplicatedPG::hit_set_persist()
   for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
        p != info.hit_set.history.end();
        ++p) {
-    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end);
+    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
 
     // Once we hit a degraded object just skip further trim
     if (is_degraded_or_backfilling_object(aoid))
@@ -10316,10 +10325,8 @@ void ReplicatedPG::hit_set_persist()
       return;
   }
 
-  oid = get_hit_set_archive_object(start, now);
+  oid = get_hit_set_archive_object(start, now, pool.info.use_gmt_hitset);
   // If the current object is degraded we skip this persist request
-  if (is_degraded_or_backfilling_object(oid))
-    return;
   if (scrubber.write_blocked_by_scrub(oid))
     return;
 
@@ -10410,7 +10417,7 @@ void ReplicatedPG::hit_set_persist()
 
   updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info);
   hit_set_create();
-  updated_hit_set_hist.current_info = pg_hit_set_info_t();
+  updated_hit_set_hist.current_info = pg_hit_set_info_t(pool.info.use_gmt_hitset);
   updated_hit_set_hist.current_last_stamp = utime_t();
 
   // fabricate an object_info_t and SnapSet
@@ -10473,7 +10480,7 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
   for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
     list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
     assert(p != updated_hit_set_hist.history.end());
-    hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+    hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
 
     assert(!is_degraded_or_backfilling_object(oid));
 
@@ -10758,7 +10765,7 @@ void ReplicatedPG::agent_load_hit_sets()
          continue;
        }
 
-       hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+       hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
        if (is_unreadable_object(oid)) {
          dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
          break;
index c8ed4fc2c909b92e3d19f15135c3effc629799c3..0894be691c2645b5fbcb75b3794201463d6f72e8 100644 (file)
@@ -903,7 +903,9 @@ protected:
   void hit_set_in_memory_trim();                     ///< discard old in memory HitSets
 
   hobject_t get_hit_set_current_object(utime_t stamp);
-  hobject_t get_hit_set_archive_object(utime_t start, utime_t end);
+  hobject_t get_hit_set_archive_object(utime_t start,
+                                      utime_t end,
+                                      bool using_gmt);
 
   // agent
   boost::scoped_ptr<TierAgentState> agent_state;
index 292fc94df05ec431eaa5564d237cf128055fd145..20127c87d33ecce48ca3895d6c2f17a2af3355ba 100644 (file)
@@ -926,6 +926,7 @@ void pg_pool_t::dump(Formatter *f) const
   f->close_section(); // hit_set_params
   f->dump_unsigned("hit_set_period", hit_set_period);
   f->dump_unsigned("hit_set_count", hit_set_count);
+  f->dump_bool("use_gmt_hitset", use_gmt_hitset);
   f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
   f->dump_unsigned("stripe_width", get_stripe_width());
   f->dump_unsigned("expected_num_objects", expected_num_objects);
@@ -1280,6 +1281,9 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
   ::encode(last_force_op_resend, bl);
   ::encode(min_read_recency_for_promote, bl);
   ::encode(expected_num_objects, bl);
+  ::encode(uint32_t(.6 * 1e6), bl);
+  ::encode(uint32_t(1), bl);
+  ::encode(use_gmt_hitset, bl);
   ENCODE_FINISH(bl);
 }
 
@@ -1397,6 +1401,17 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
   } else {
     expected_num_objects = 0;
   }
+  if (struct_v >= 19) {
+    uint32_t dummy;        
+    ::decode(dummy, bl);
+  }
+  if (struct_v >= 20) {
+    uint32_t dummy;
+    ::decode(dummy, bl);
+    ::decode(use_gmt_hitset, bl);
+  } else {
+    use_gmt_hitset = false;
+  }
   DECODE_FINISH(bl);
   calc_pg_masks();
 }
@@ -3795,19 +3810,25 @@ void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
 
 void pg_hit_set_info_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(1, 1, bl);
+  ENCODE_START(2, 1, bl);
   ::encode(begin, bl);
   ::encode(end, bl);
   ::encode(version, bl);
+  ::encode(using_gmt, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_hit_set_info_t::decode(bufferlist::iterator& p)
 {
-  DECODE_START(1, p);
+  DECODE_START(2, p);
   ::decode(begin, p);
   ::decode(end, p);
   ::decode(version, p);
+  if (struct_v >= 2) {
+    ::decode(using_gmt, p);
+  } else {
+    using_gmt = false;
+  }
   DECODE_FINISH(p);
 }
 
@@ -3816,6 +3837,7 @@ void pg_hit_set_info_t::dump(Formatter *f) const
   f->dump_stream("begin") << begin;
   f->dump_stream("end") << end;
   f->dump_stream("version") << version;
+  f->dump_stream("using_gmt") << using_gmt;
 }
 
 void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
index 7557494a0af4a700f9bf0e7210cea2d5d20ebd8a..92f616324335aba08a7450e5fb14848c5e6bb54a 100644 (file)
@@ -1035,6 +1035,7 @@ public:
   HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
   uint32_t hit_set_period;      ///< periodicity of HitSet segments (seconds)
   uint32_t hit_set_count;       ///< number of periods to retain
+  bool use_gmt_hitset;         ///< use gmt to name the hitset archive object
   uint32_t min_read_recency_for_promote;   ///< minimum number of HitSet to check before promote
 
   uint32_t stripe_width;        ///< erasure coded stripe size in bytes
@@ -1063,6 +1064,7 @@ public:
       hit_set_params(),
       hit_set_period(0),
       hit_set_count(0),
+      use_gmt_hitset(true),
       min_read_recency_for_promote(0),
       stripe_width(0),
       expected_num_objects(0)
@@ -1600,10 +1602,11 @@ WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
 struct pg_hit_set_info_t {
   utime_t begin, end;   ///< time interval
   eversion_t version;   ///< version this HitSet object was written
-
-  pg_hit_set_info_t() {}
-  pg_hit_set_info_t(utime_t b)
-    : begin(b) {}
+  bool using_gmt;      ///< use gmt for creating the hit_set archive object name
+  pg_hit_set_info_t(bool using_gmt = true)
+    : using_gmt(using_gmt) {}
+  pg_hit_set_info_t(utime_t b, bool using_gmt)
+    : begin(b), using_gmt(using_gmt) {}
 
   void encode(bufferlist &bl) const;
   void decode(bufferlist::iterator &bl);