From: Kefu Chai Date: Fri, 5 Jun 2015 13:06:48 +0000 (+0800) Subject: osd: use GMT time for the object name of hitsets X-Git-Tag: v0.94.6~84^2~9 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=040e390d29fc68dcefe48c59cccacf6258c6f690;p=ceph.git osd: use GMT time for the object name of hitsets * bump the encoding version of pg_hit_set_info_t to 2, so we can tell if the corresponding hit_set is named using localtime or GMT * bump the encoding version of pg_pool_t to 20, so we can know if a pool is using GMT to name the hit_set archive or not. and we can tell if current cluster allows OSDs that do not support GMT mode or not. * add an option named `osd_pool_use_gmt_hitset`. if enabled, the cluster will try to use GMT mode when creating a new pool if all the up OSDs support GMT mode. if any of the pools in the cluster is using GMT mode, then only OSDs supporting GMT mode are allowed to join the cluster. Fixes: #9732 Signed-off-by: Kefu Chai (cherry picked from commit 42f8c5daad16aa849a0b99871d50161673c0c370) Conflicts: src/include/ceph_features.h src/osd/ReplicatedPG.cc src/osd/osd_types.cc src/osd/osd_types.h fill pg_pool_t with default settings in master branch. --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 92b02d3f698..c6ca69b4a35 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -497,6 +497,7 @@ OPTION(osd_client_message_cap, OPT_U64, 100) // num client messages OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host +OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it. 
OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET) OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h index 781df1b3003..a5d9fc1f16f 100644 --- a/src/include/ceph_features.h +++ b/src/include/ceph_features.h @@ -64,6 +64,7 @@ // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ #define CEPH_FEATURE_MON_METADATA (1ULL<<50) +#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<51) /* overlap with bitwise sort */ /* ... */ #define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index cdbb6c752e9..4b9fddaab5d 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -16,6 +16,7 @@ * */ +#include #include #include "OSDMonitor.h" @@ -1572,6 +1573,9 @@ void OSDMonitor::take_all_failures(list& ls) failure_info.clear(); } +static bool uses_gmt_hitset(const std::pair& pool) { + return pool.second.use_gmt_hitset; +} // boot -- @@ -1641,6 +1645,19 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m) } } + if (std::find_if(osdmap.get_pools().begin(), + osdmap.get_pools().end(), + uses_gmt_hitset) != osdmap.get_pools().end()) { + assert(osdmap.get_num_up_osds() == 0 || + osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT); + if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) { + dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at " + << m->get_orig_source_inst() + << " doesn't announce support -- ignore" << dendl; + goto ignore; + } + } + // already booted? 
if (osdmap.is_up(from) && osdmap.get_inst(from) == m->get_orig_source_inst()) { @@ -4075,7 +4092,9 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE); if (g_conf->osd_pool_default_flag_nosizechange) pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE); - + if (g_conf->osd_pool_use_gmt_hitset && + (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) + pi->use_gmt_hitset = true; pi->size = size; pi->min_size = min_size; pi->crush_ruleset = crush_ruleset; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index e1d0acf5089..ca3d55e6303 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -1135,7 +1135,7 @@ void ReplicatedPG::do_pg_op(OpRequestRef op) p != info.hit_set.history.end(); ++p) { if (stamp >= p->begin && stamp <= p->end) { - oid = get_hit_set_archive_object(p->begin, p->end); + oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); break; } } @@ -10110,10 +10110,19 @@ hobject_t ReplicatedPG::get_hit_set_current_object(utime_t stamp) return hoid; } -hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end) +hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, + utime_t end, + bool using_gmt) { ostringstream ss; - ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end; + ss << "hit_set_" << info.pgid.pgid << "_archive_"; + if (using_gmt) { + start.gmtime(ss) << "_"; + end.gmtime(ss); + } else { + start.localtime(ss) << "_"; + end.localtime(ss); + } hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "", info.pgid.ps(), info.pgid.pool(), cct->_conf->osd_hit_set_namespace); @@ -10250,7 +10259,7 @@ void ReplicatedPG::hit_set_persist() for (list::iterator p = info.hit_set.history.begin(); p != info.hit_set.history.end(); ++p) { - hobject_t aoid = get_hit_set_archive_object(p->begin, p->end); + hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); // Once we hit a degraded object just skip further trim 
if (is_degraded_or_backfilling_object(aoid)) @@ -10259,10 +10268,8 @@ void ReplicatedPG::hit_set_persist() return; } - oid = get_hit_set_archive_object(start, now); + oid = get_hit_set_archive_object(start, now, pool.info.use_gmt_hitset); // If the current object is degraded we skip this persist request - if (is_degraded_or_backfilling_object(oid)) - return; if (scrubber.write_blocked_by_scrub(oid)) return; @@ -10353,7 +10360,7 @@ void ReplicatedPG::hit_set_persist() updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info); hit_set_create(); - updated_hit_set_hist.current_info = pg_hit_set_info_t(); + updated_hit_set_hist.current_info = pg_hit_set_info_t(pool.info.use_gmt_hitset); updated_hit_set_hist.current_last_stamp = utime_t(); // fabricate an object_info_t and SnapSet @@ -10416,7 +10423,7 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max) for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) { list::iterator p = updated_hit_set_hist.history.begin(); assert(p != updated_hit_set_hist.history.end()); - hobject_t oid = get_hit_set_archive_object(p->begin, p->end); + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); assert(!is_degraded_or_backfilling_object(oid)); @@ -10701,7 +10708,7 @@ void ReplicatedPG::agent_load_hit_sets() continue; } - hobject_t oid = get_hit_set_archive_object(p->begin, p->end); + hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt); if (is_unreadable_object(oid)) { dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl; break; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 48e0def334e..d09dba32a67 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -903,7 +903,9 @@ protected: void hit_set_in_memory_trim(); ///< discard old in memory HitSets hobject_t get_hit_set_current_object(utime_t stamp); - hobject_t get_hit_set_archive_object(utime_t start, utime_t end); + hobject_t 
get_hit_set_archive_object(utime_t start, + utime_t end, + bool using_gmt); // agent boost::scoped_ptr agent_state; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index b2bea5b7204..7253160014d 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -926,6 +926,7 @@ void pg_pool_t::dump(Formatter *f) const f->close_section(); // hit_set_params f->dump_unsigned("hit_set_period", hit_set_period); f->dump_unsigned("hit_set_count", hit_set_count); + f->dump_bool("use_gmt_hitset", use_gmt_hitset); f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote); f->dump_unsigned("stripe_width", get_stripe_width()); f->dump_unsigned("expected_num_objects", expected_num_objects); @@ -1280,6 +1281,9 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const ::encode(last_force_op_resend, bl); ::encode(min_read_recency_for_promote, bl); ::encode(expected_num_objects, bl); + ::encode(uint32_t(.6 * 1e6), bl); + ::encode(uint32_t(1), bl); + ::encode(use_gmt_hitset, bl); ENCODE_FINISH(bl); } @@ -1397,6 +1401,17 @@ void pg_pool_t::decode(bufferlist::iterator& bl) } else { expected_num_objects = 0; } + if (struct_v >= 19) { + uint32_t dummy; + ::decode(dummy, bl); + } + if (struct_v >= 20) { + uint32_t dummy; + ::decode(dummy, bl); + ::decode(use_gmt_hitset, bl); + } else { + use_gmt_hitset = false; + } DECODE_FINISH(bl); calc_pg_masks(); } @@ -3789,19 +3804,25 @@ void pg_create_t::generate_test_instances(list& o) void pg_hit_set_info_t::encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); ::encode(begin, bl); ::encode(end, bl); ::encode(version, bl); + ::encode(using_gmt, bl); ENCODE_FINISH(bl); } void pg_hit_set_info_t::decode(bufferlist::iterator& p) { - DECODE_START(1, p); + DECODE_START(2, p); ::decode(begin, p); ::decode(end, p); ::decode(version, p); + if (struct_v >= 2) { + ::decode(using_gmt, p); + } else { + using_gmt = false; + } DECODE_FINISH(p); } @@ -3810,6 +3831,7 @@ void 
pg_hit_set_info_t::dump(Formatter *f) const f->dump_stream("begin") << begin; f->dump_stream("end") << end; f->dump_stream("version") << version; + f->dump_stream("using_gmt") << using_gmt; } void pg_hit_set_info_t::generate_test_instances(list& ls) diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index b9b3b8150ab..ed6bbfc24b7 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1035,6 +1035,7 @@ public: HitSet::Params hit_set_params; ///< The HitSet params to use on this pool uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds) uint32_t hit_set_count; ///< number of periods to retain + bool use_gmt_hitset; ///< use gmt to name the hitset archive object uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote uint32_t stripe_width; ///< erasure coded stripe size in bytes @@ -1063,6 +1064,7 @@ public: hit_set_params(), hit_set_period(0), hit_set_count(0), + use_gmt_hitset(true), min_read_recency_for_promote(0), stripe_width(0), expected_num_objects(0) @@ -1600,10 +1602,11 @@ WRITE_CLASS_ENCODER_FEATURES(pool_stat_t) struct pg_hit_set_info_t { utime_t begin, end; ///< time interval eversion_t version; ///< version this HitSet object was written - - pg_hit_set_info_t() {} - pg_hit_set_info_t(utime_t b) - : begin(b) {} + bool using_gmt; ///< use gmt for creating the hit_set archive object name + pg_hit_set_info_t(bool using_gmt = true) + : using_gmt(using_gmt) {} + pg_hit_set_info_t(utime_t b, bool using_gmt) + : begin(b), using_gmt(using_gmt) {} void encode(bufferlist &bl) const; void decode(bufferlist::iterator &bl);