OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd
OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd
OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host
+OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // name hitset archive objects using GMT; a pool is only marked use_gmt_hitset when this is set and the up-OSD feature mask has CEPH_FEATURE_OSD_HITSET_GMT
OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes
#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
#define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */
+#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<51) /* osd supports GMT-named hitset archives; deliberately the same bit (51) as CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT */
#define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52)
#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53)
*
*/
+#include <algorithm>
#include <sstream>
#include <boost/assign.hpp>
goto ignore;
}
+ // Refuse to boot an OSD that lacks CEPH_FEATURE_OSD_HITSET_GMT once any
+ // pool has use_gmt_hitset set.
+ // The predicate's key type is spelled const so the parameter is exactly the
+ // element type of a std::map and binds by reference; with a non-const key a
+ // temporary pair (including a full pg_pool_t copy) is built for every pool.
+ // NOTE(review): assumes get_pools() returns a std::map<int64_t,pg_pool_t> -- confirm.
+ if (any_of(osdmap.get_pools().begin(),
+ osdmap.get_pools().end(),
+ [](const std::pair<const int64_t,pg_pool_t>& pool)
+ { return pool.second.use_gmt_hitset; })) {
+ // either no OSD is up, or the up-OSD feature mask already carries the bit
+ assert(osdmap.get_num_up_osds() == 0 ||
+ osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
+ if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
+ dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
+ << m->get_orig_source_inst()
+ << " doesn't announce support -- ignore" << dendl;
+ goto ignore;
+ }
+ }
+
+
// already booted?
if (osdmap.is_up(from) &&
osdmap.get_inst(from) == m->get_orig_source_inst()) {
pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
if (g_conf->osd_pool_default_flag_nosizechange)
pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
-
+ if (g_conf->osd_pool_use_gmt_hitset &&
+ (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
+ pi->use_gmt_hitset = true;
pi->size = size;
pi->min_size = min_size;
pi->crush_ruleset = crush_ruleset;
p != info.hit_set.history.end();
++p) {
if (stamp >= p->begin && stamp <= p->end) {
- oid = get_hit_set_archive_object(p->begin, p->end);
+ oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
break;
}
}
return hoid;
}
-hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end)
+hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start,
+ utime_t end,
+ bool using_gmt)
{
ostringstream ss;
- ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end;
+ ss << "hit_set_" << info.pgid.pgid << "_archive_";
+ if (using_gmt) {
+ start.gmtime(ss) << "_";
+ end.gmtime(ss);
+ } else {
+ start.localtime(ss) << "_";
+ end.localtime(ss);
+ }
hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
info.pgid.ps(), info.pgid.pool(),
cct->_conf->osd_hit_set_namespace);
for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
p != info.hit_set.history.end();
++p) {
- hobject_t aoid = get_hit_set_archive_object(p->begin, p->end);
+ hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
// Once we hit a degraded object just skip
if (is_degraded_or_backfilling_object(aoid))
if (!info.hit_set.history.empty()) {
list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
assert(p != info.hit_set.history.rend());
- hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
assert(!is_degraded_or_backfilling_object(oid));
ObjectContextRef obc = get_object_context(oid, false);
assert(obc);
for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
p != info.hit_set.history.end();
++p) {
- hobject_t aoid = get_hit_set_archive_object(p->begin, p->end);
+ hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
// Once we hit a degraded object just skip further trim
if (is_degraded_or_backfilling_object(aoid))
utime_t start = info.hit_set.current_info.begin;
if (!start)
start = hit_set_start_stamp;
- oid = get_hit_set_archive_object(start, now);
+ oid = get_hit_set_archive_object(start, now, pool.info.use_gmt_hitset);
+ // If the current object is degraded we skip this persist request
if (scrubber.write_blocked_by_scrub(oid, get_sort_bitwise()))
return;
updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info);
hit_set_create();
- updated_hit_set_hist.current_info = pg_hit_set_info_t();
+ updated_hit_set_hist.current_info = pg_hit_set_info_t(pool.info.use_gmt_hitset);
updated_hit_set_hist.current_last_stamp = utime_t();
// fabricate an object_info_t and SnapSet
for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
assert(p != updated_hit_set_hist.history.end());
- hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
assert(!is_degraded_or_backfilling_object(oid));
continue;
}
- hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
if (is_unreadable_object(oid)) {
dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
break;
void hit_set_remove_all();
hobject_t get_hit_set_current_object(utime_t stamp);
- hobject_t get_hit_set_archive_object(utime_t start, utime_t end);
+ hobject_t get_hit_set_archive_object(utime_t start,
+ utime_t end,
+ bool using_gmt);
// agent
boost::scoped_ptr<TierAgentState> agent_state;
f->close_section(); // hit_set_params
f->dump_unsigned("hit_set_period", hit_set_period);
f->dump_unsigned("hit_set_count", hit_set_count);
+ f->dump_bool("use_gmt_hitset", use_gmt_hitset);
f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
f->dump_unsigned("stripe_width", get_stripe_width());
::encode(expected_num_objects, bl);
::encode(cache_target_dirty_high_ratio_micro, bl);
::encode(min_write_recency_for_promote, bl);
+ ::encode(use_gmt_hitset, bl);
ENCODE_FINISH(bl);
}
}
if (struct_v >= 20) {
::decode(min_write_recency_for_promote, bl);
+ ::decode(use_gmt_hitset, bl);
} else {
min_write_recency_for_promote = 1;
+ use_gmt_hitset = false;
}
DECODE_FINISH(bl);
calc_pg_masks();
void pg_hit_set_info_t::encode(bufferlist& bl) const
{
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl); // first argument raised from 1 to 2 because using_gmt is appended below; second argument left at 1
::encode(begin, bl);
::encode(end, bl);
::encode(version, bl);
+ ::encode(using_gmt, bl); // written last, after the three fields that were already encoded
ENCODE_FINISH(bl);
}
void pg_hit_set_info_t::decode(bufferlist::iterator& p)
{
- DECODE_START(1, p);
+ DECODE_START(2, p); // 2 matches the first argument of ENCODE_START in encode()
::decode(begin, p);
::decode(end, p);
::decode(version, p);
+ if (struct_v >= 2) { // struct_v is not declared in this function; presumably supplied by DECODE_START -- confirm
+ ::decode(using_gmt, p); // read in the same position encode() writes it: after version
+ } else {
+ using_gmt = false; // input older than 2 carries no using_gmt field, so default to false
+ }
DECODE_FINISH(p);
}
f->dump_stream("begin") << begin;
f->dump_stream("end") << end;
f->dump_stream("version") << version;
+ f->dump_bool("using_gmt", using_gmt); // using_gmt is a bool: emit it with dump_bool, as the pool dump does for use_gmt_hitset, rather than streaming it
}
void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds)
uint32_t hit_set_count; ///< number of periods to retain
+ bool use_gmt_hitset; ///< use gmt to name the hitset archive object
uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote on read
uint32_t min_write_recency_for_promote; ///< minimum number of HitSet to check before promote on write
hit_set_params(),
hit_set_period(0),
hit_set_count(0),
+ use_gmt_hitset(true),
min_read_recency_for_promote(0),
min_write_recency_for_promote(0),
stripe_width(0),
struct pg_hit_set_info_t {
utime_t begin, end; ///< time interval
eversion_t version; ///< version this HitSet object was written
-
- pg_hit_set_info_t() {}
- pg_hit_set_info_t(utime_t b)
- : begin(b) {}
+ bool using_gmt; ///< use gmt for creating the hit_set archive object name
+ pg_hit_set_info_t(bool using_gmt = true)
+ : using_gmt(using_gmt) {}
+ pg_hit_set_info_t(utime_t b, bool using_gmt)
+ : begin(b), using_gmt(using_gmt) {}
void encode(bufferlist &bl) const;
void decode(bufferlist::iterator &bl);