]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: use GMT time for the object name of hitsets
authorKefu Chai <kchai@redhat.com>
Fri, 5 Jun 2015 13:06:48 +0000 (21:06 +0800)
committerKefu Chai <kchai@redhat.com>
Sun, 23 Aug 2015 09:32:37 +0000 (17:32 +0800)
* bump the encoding version of pg_hit_set_info_t to 2, so we can
  tell if the corresponding hit_set is named using localtime or
  GMT
* bump the encoding version of pg_pool_t to 20, so we can know
  if a pool is using GMT to name the hit_set archive or not. and
  we can tell if current cluster allows OSDs not support GMT
  mode or not.
* add an option named `osd_pool_use_gmt_hitset`. if enabled,
  the cluster will try to use GMT mode when creating a new pool
  if all the the up OSDs support GMT mode. if any of the
  pools in the cluster is using GMT mode, then only OSDs
  supporting GMT mode are allowed to join the cluster.

Fixes: #9732
Signed-off-by: Kefu Chai <kchai@redhat.com>
src/common/config_opts.h
src/include/ceph_features.h
src/mon/OSDMonitor.cc
src/osd/ReplicatedPG.cc
src/osd/ReplicatedPG.h
src/osd/osd_types.cc
src/osd/osd_types.h

index d30e15e3428a944daae1d99ff0e7deaa29284c02..4d25c5abe4110bd4558c1666903df82fd8b7dd5a 100644 (file)
@@ -533,6 +533,7 @@ OPTION(osd_client_message_cap, OPT_U64, 100)              // num client messages
 OPTION(osd_pg_bits, OPT_INT, 6)  // bits per osd
 OPTION(osd_pgp_bits, OPT_INT, 6)  // bits per osd
 OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host
+OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it.
 OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
 OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
 OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes
index 0cfc20add7aeb184529fd1f453edb22a52d34792..81ff511c23250dd0aa6ad93994235c93bacbee3b 100644 (file)
@@ -65,6 +65,7 @@
 #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49)  /* overlap w/ above */
 #define CEPH_FEATURE_MON_METADATA (1ULL<<50)
 #define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */
+#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<51) /* overlap with bitwise sort */
 #define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52)
 #define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53)
 
index 95d57dbf5242474daea2a68bdd7a4d48c8d2f7b0..1e67c221f90dc64b9f63ce136b9bf245f13c2446 100644 (file)
@@ -16,6 +16,7 @@
  *
  */
 
+#include <algorithm>
 #include <sstream>
 #include <boost/assign.hpp>
 
@@ -1847,6 +1848,20 @@ bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
     goto ignore;
   }
 
+  if (any_of(osdmap.get_pools().begin(),
+            osdmap.get_pools().end(),
+            [](const std::pair<int64_t,pg_pool_t>& pool)
+            { return pool.second.use_gmt_hitset; })) {
+    assert(osdmap.get_num_up_osds() == 0 ||
+          osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
+    if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
+      dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
+             << m->get_orig_source_inst()
+             << " doesn't announce support -- ignore" << dendl;
+      goto ignore;
+    }
+  }
+
   // already booted?
   if (osdmap.is_up(from) &&
       osdmap.get_inst(from) == m->get_orig_source_inst()) {
@@ -4421,7 +4436,9 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
     pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
   if (g_conf->osd_pool_default_flag_nosizechange)
     pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
-
+  if (g_conf->osd_pool_use_gmt_hitset &&
+      (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
+    pi->use_gmt_hitset = true;
   pi->size = size;
   pi->min_size = min_size;
   pi->crush_ruleset = crush_ruleset;
index 08a29c735af7af4c2a4ca50cda023444f567e8f5..eeb1f2998d548b7d1a3bfe68efe865378b835607 100644 (file)
@@ -1156,7 +1156,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
-             oid = get_hit_set_archive_object(p->begin, p->end);
+             oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
@@ -10535,10 +10535,19 @@ hobject_t ReplicatedPG::get_hit_set_current_object(utime_t stamp)
   return hoid;
 }
 
-hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end)
+hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start,
+                                                  utime_t end,
+                                                  bool using_gmt)
 {
   ostringstream ss;
-  ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end;
+  ss << "hit_set_" << info.pgid.pgid << "_archive_";
+  if (using_gmt) {
+    start.gmtime(ss) << "_";
+    end.gmtime(ss);
+  } else {
+    start.localtime(ss) << "_";
+    end.localtime(ss);
+  }
   hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
                 info.pgid.ps(), info.pgid.pool(),
                 cct->_conf->osd_hit_set_namespace);
@@ -10583,7 +10592,7 @@ void ReplicatedPG::hit_set_remove_all()
   for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
        p != info.hit_set.history.end();
        ++p) {
-    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end);
+    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
 
     // Once we hit a degraded object just skip
     if (is_degraded_or_backfilling_object(aoid))
@@ -10595,7 +10604,7 @@ void ReplicatedPG::hit_set_remove_all()
   if (!info.hit_set.history.empty()) {
     list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
     assert(p != info.hit_set.history.rend());
-    hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+    hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
     assert(!is_degraded_or_backfilling_object(oid));
     ObjectContextRef obc = get_object_context(oid, false);
     assert(obc);
@@ -10713,7 +10722,7 @@ void ReplicatedPG::hit_set_persist()
   for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
        p != info.hit_set.history.end();
        ++p) {
-    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end);
+    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
 
     // Once we hit a degraded object just skip further trim
     if (is_degraded_or_backfilling_object(aoid))
@@ -10725,7 +10734,8 @@ void ReplicatedPG::hit_set_persist()
   utime_t start = info.hit_set.current_info.begin;
   if (!start)
      start = hit_set_start_stamp;
-  oid = get_hit_set_archive_object(start, now);
+  oid = get_hit_set_archive_object(start, now, pool.info.use_gmt_hitset);
+  // If the current object is degraded we skip this persist request
   if (scrubber.write_blocked_by_scrub(oid, get_sort_bitwise()))
     return;
 
@@ -10816,7 +10826,7 @@ void ReplicatedPG::hit_set_persist()
 
   updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info);
   hit_set_create();
-  updated_hit_set_hist.current_info = pg_hit_set_info_t();
+  updated_hit_set_hist.current_info = pg_hit_set_info_t(pool.info.use_gmt_hitset);
   updated_hit_set_hist.current_last_stamp = utime_t();
 
   // fabricate an object_info_t and SnapSet
@@ -10879,7 +10889,7 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
   for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
     list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
     assert(p != updated_hit_set_hist.history.end());
-    hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+    hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
 
     assert(!is_degraded_or_backfilling_object(oid));
 
@@ -11168,7 +11178,7 @@ void ReplicatedPG::agent_load_hit_sets()
          continue;
        }
 
-       hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+       hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
        if (is_unreadable_object(oid)) {
          dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
          break;
index ab24da4199f1c6dd11fdb5f0ff9c0cc19f56e1be..efd81d27042487f96c6d2cfd2beb83aa1f87fcb8 100644 (file)
@@ -959,7 +959,9 @@ protected:
   void hit_set_remove_all();
 
   hobject_t get_hit_set_current_object(utime_t stamp);
-  hobject_t get_hit_set_archive_object(utime_t start, utime_t end);
+  hobject_t get_hit_set_archive_object(utime_t start,
+                                      utime_t end,
+                                      bool using_gmt);
 
   // agent
   boost::scoped_ptr<TierAgentState> agent_state;
index ad68118fd41ff4d8b6335bafed0708293f9e637b..975d51391cd8e5033009eb5917d108f60993ac02 100644 (file)
@@ -942,6 +942,7 @@ void pg_pool_t::dump(Formatter *f) const
   f->close_section(); // hit_set_params
   f->dump_unsigned("hit_set_period", hit_set_period);
   f->dump_unsigned("hit_set_count", hit_set_count);
+  f->dump_bool("use_gmt_hitset", use_gmt_hitset);
   f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
   f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
   f->dump_unsigned("stripe_width", get_stripe_width());
@@ -1299,6 +1300,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
   ::encode(expected_num_objects, bl);
   ::encode(cache_target_dirty_high_ratio_micro, bl);
   ::encode(min_write_recency_for_promote, bl);
+  ::encode(use_gmt_hitset, bl);
   ENCODE_FINISH(bl);
 }
 
@@ -1423,8 +1425,10 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
   }
   if (struct_v >= 20) {
     ::decode(min_write_recency_for_promote, bl);
+    ::decode(use_gmt_hitset, bl);
   } else {
     min_write_recency_for_promote = 1;
+    use_gmt_hitset = false;
   }
   DECODE_FINISH(bl);
   calc_pg_masks();
@@ -3931,19 +3935,25 @@ void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
 
 void pg_hit_set_info_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(1, 1, bl);
+  ENCODE_START(2, 1, bl);
   ::encode(begin, bl);
   ::encode(end, bl);
   ::encode(version, bl);
+  ::encode(using_gmt, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_hit_set_info_t::decode(bufferlist::iterator& p)
 {
-  DECODE_START(1, p);
+  DECODE_START(2, p);
   ::decode(begin, p);
   ::decode(end, p);
   ::decode(version, p);
+  if (struct_v >= 2) {
+    ::decode(using_gmt, p);
+  } else {
+    using_gmt = false;
+  }
   DECODE_FINISH(p);
 }
 
@@ -3952,6 +3962,7 @@ void pg_hit_set_info_t::dump(Formatter *f) const
   f->dump_stream("begin") << begin;
   f->dump_stream("end") << end;
   f->dump_stream("version") << version;
+  f->dump_stream("using_gmt") << using_gmt;
 }
 
 void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
index b94f547cea71d34d84c949fee1930b67c8a718de..0d1327ed4ca7d1a22cef762d60157b0cbc8bf943 100644 (file)
@@ -1111,6 +1111,7 @@ public:
   HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
   uint32_t hit_set_period;      ///< periodicity of HitSet segments (seconds)
   uint32_t hit_set_count;       ///< number of periods to retain
+  bool use_gmt_hitset;         ///< use gmt to name the hitset archive object
   uint32_t min_read_recency_for_promote;   ///< minimum number of HitSet to check before promote on read
   uint32_t min_write_recency_for_promote;  ///< minimum number of HitSet to check before promote on write
 
@@ -1141,6 +1142,7 @@ public:
       hit_set_params(),
       hit_set_period(0),
       hit_set_count(0),
+      use_gmt_hitset(true),
       min_read_recency_for_promote(0),
       min_write_recency_for_promote(0),
       stripe_width(0),
@@ -1717,10 +1719,11 @@ WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
 struct pg_hit_set_info_t {
   utime_t begin, end;   ///< time interval
   eversion_t version;   ///< version this HitSet object was written
-
-  pg_hit_set_info_t() {}
-  pg_hit_set_info_t(utime_t b)
-    : begin(b) {}
+  bool using_gmt;      ///< use gmt for creating the hit_set archive object name
+  pg_hit_set_info_t(bool using_gmt = true)
+    : using_gmt(using_gmt) {}
+  pg_hit_set_info_t(utime_t b, bool using_gmt)
+    : begin(b), using_gmt(using_gmt) {}
 
   void encode(bufferlist &bl) const;
   void decode(bufferlist::iterator &bl);