* bump the encoding version of pg_hit_set_info_t to 2, so we can
tell if the corresponding hit_set is named using localtime or
GMT
* bump the encoding version of pg_pool_t to 20, so we can tell
whether a pool is using GMT to name the hit_set archive, and
whether the current cluster allows OSDs that do not support
GMT mode.
* add an option named `osd_pool_use_gmt_hitset`. If enabled,
the cluster will try to use GMT mode when creating a new pool
if all the up OSDs support GMT mode. If any of the
pools in the cluster is using GMT mode, then only OSDs
supporting GMT mode are allowed to join the cluster.
Fixes: #9732
Signed-off-by: Kefu Chai <kchai@redhat.com>
(cherry picked from commit
42f8c5daad16aa849a0b99871d50161673c0c370)
Conflicts:
src/include/ceph_features.h
src/osd/ReplicatedPG.cc
src/osd/osd_types.cc
src/osd/osd_types.h
fill pg_pool_t with default settings in master branch.
OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd
OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd
OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host
+OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it.
OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes
// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
+#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<51) /* overlap with bitwise sort */
/* ... */
#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
*
*/
+#include <algorithm>
#include <sstream>
#include "OSDMonitor.h"
failure_info.clear();
}
+static bool uses_gmt_hitset(const std::pair<int64_t, pg_pool_t>& pool) {
+ return pool.second.use_gmt_hitset;
+}
// boot --
}
}
+ if (std::find_if(osdmap.get_pools().begin(),
+ osdmap.get_pools().end(),
+ uses_gmt_hitset) != osdmap.get_pools().end()) {
+ assert(osdmap.get_num_up_osds() == 0 ||
+ osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
+ if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
+ dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
+ << m->get_orig_source_inst()
+ << " doesn't announce support -- ignore" << dendl;
+ goto ignore;
+ }
+ }
+
// already booted?
if (osdmap.is_up(from) &&
osdmap.get_inst(from) == m->get_orig_source_inst()) {
pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
if (g_conf->osd_pool_default_flag_nosizechange)
pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
-
+ if (g_conf->osd_pool_use_gmt_hitset &&
+ (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
+ pi->use_gmt_hitset = true;
pi->size = size;
pi->min_size = min_size;
pi->crush_ruleset = crush_ruleset;
p != info.hit_set.history.end();
++p) {
if (stamp >= p->begin && stamp <= p->end) {
- oid = get_hit_set_archive_object(p->begin, p->end);
+ oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
break;
}
}
return hoid;
}
-hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end)
+hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start,
+ utime_t end,
+ bool using_gmt)
{
ostringstream ss;
- ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end;
+ ss << "hit_set_" << info.pgid.pgid << "_archive_";
+ if (using_gmt) {
+ start.gmtime(ss) << "_";
+ end.gmtime(ss);
+ } else {
+ start.localtime(ss) << "_";
+ end.localtime(ss);
+ }
hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
info.pgid.ps(), info.pgid.pool(),
cct->_conf->osd_hit_set_namespace);
for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
p != info.hit_set.history.end();
++p) {
- hobject_t aoid = get_hit_set_archive_object(p->begin, p->end);
+ hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
// Once we hit a degraded object just skip further trim
if (is_degraded_or_backfilling_object(aoid))
return;
}
- oid = get_hit_set_archive_object(start, now);
+ oid = get_hit_set_archive_object(start, now, pool.info.use_gmt_hitset);
// If the current object is degraded we skip this persist request
- if (is_degraded_or_backfilling_object(oid))
- return;
if (scrubber.write_blocked_by_scrub(oid))
return;
updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info);
hit_set_create();
- updated_hit_set_hist.current_info = pg_hit_set_info_t();
+ updated_hit_set_hist.current_info = pg_hit_set_info_t(pool.info.use_gmt_hitset);
updated_hit_set_hist.current_last_stamp = utime_t();
// fabricate an object_info_t and SnapSet
for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
assert(p != updated_hit_set_hist.history.end());
- hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
assert(!is_degraded_or_backfilling_object(oid));
continue;
}
- hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
if (is_unreadable_object(oid)) {
dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
break;
void hit_set_in_memory_trim(); ///< discard old in memory HitSets
hobject_t get_hit_set_current_object(utime_t stamp);
- hobject_t get_hit_set_archive_object(utime_t start, utime_t end);
+ hobject_t get_hit_set_archive_object(utime_t start,
+ utime_t end,
+ bool using_gmt);
// agent
boost::scoped_ptr<TierAgentState> agent_state;
f->close_section(); // hit_set_params
f->dump_unsigned("hit_set_period", hit_set_period);
f->dump_unsigned("hit_set_count", hit_set_count);
+ f->dump_bool("use_gmt_hitset", use_gmt_hitset);
f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
f->dump_unsigned("stripe_width", get_stripe_width());
f->dump_unsigned("expected_num_objects", expected_num_objects);
::encode(last_force_op_resend, bl);
::encode(min_read_recency_for_promote, bl);
::encode(expected_num_objects, bl);
+ ::encode(uint32_t(.6 * 1e6), bl);
+ ::encode(uint32_t(1), bl);
+ ::encode(use_gmt_hitset, bl);
ENCODE_FINISH(bl);
}
} else {
expected_num_objects = 0;
}
+ if (struct_v >= 19) {
+ uint32_t dummy;
+ ::decode(dummy, bl);
+ }
+ if (struct_v >= 20) {
+ uint32_t dummy;
+ ::decode(dummy, bl);
+ ::decode(use_gmt_hitset, bl);
+ } else {
+ use_gmt_hitset = false;
+ }
DECODE_FINISH(bl);
calc_pg_masks();
}
void pg_hit_set_info_t::encode(bufferlist& bl) const
{
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(begin, bl);
::encode(end, bl);
::encode(version, bl);
+ ::encode(using_gmt, bl);
ENCODE_FINISH(bl);
}
void pg_hit_set_info_t::decode(bufferlist::iterator& p)
{
- DECODE_START(1, p);
+ DECODE_START(2, p);
::decode(begin, p);
::decode(end, p);
::decode(version, p);
+ if (struct_v >= 2) {
+ ::decode(using_gmt, p);
+ } else {
+ using_gmt = false;
+ }
DECODE_FINISH(p);
}
f->dump_stream("begin") << begin;
f->dump_stream("end") << end;
f->dump_stream("version") << version;
+ f->dump_stream("using_gmt") << using_gmt;
}
void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds)
uint32_t hit_set_count; ///< number of periods to retain
+ bool use_gmt_hitset; ///< use gmt to name the hitset archive object
uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote
uint32_t stripe_width; ///< erasure coded stripe size in bytes
hit_set_params(),
hit_set_period(0),
hit_set_count(0),
+ use_gmt_hitset(true),
min_read_recency_for_promote(0),
stripe_width(0),
expected_num_objects(0)
struct pg_hit_set_info_t {
utime_t begin, end; ///< time interval
eversion_t version; ///< version this HitSet object was written
-
- pg_hit_set_info_t() {}
- pg_hit_set_info_t(utime_t b)
- : begin(b) {}
+ bool using_gmt; ///< use gmt for creating the hit_set archive object name
+ pg_hit_set_info_t(bool using_gmt = true)
+ : using_gmt(using_gmt) {}
+ pg_hit_set_info_t(utime_t b, bool using_gmt)
+ : begin(b), using_gmt(using_gmt) {}
void encode(bufferlist &bl) const;
void decode(bufferlist::iterator &bl);