OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd
OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd
OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host
-OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it.
OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes
// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
-#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
/* ... */
#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
CEPH_FEATURE_MDS_QUOTA | \
CEPH_FEATURE_CRUSH_V4 | \
CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY | \
- CEPH_FEATURE_OSD_HITSET_GMT | \
CEPH_FEATURE_HAMMER_0_94_4 | \
0ULL)
"get pool parameter <var>", "osd", "r", "cli,rest")
// "osd pool set": takes a pool name, a parameter name <var>, a value <val>
// and an optional force flag (--yes-i-really-mean-it, req=false).  <var> is
// restricted to the names listed in the CephChoices string below.
// In this hunk the '+' line drops use_gmt_hitset from the accepted <var>
// names; every other name in the list is unchanged from the '-' line.
COMMAND("osd pool set " \
"name=pool,type=CephPoolname " \
- "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed " \
+ "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|write_fadvise_dontneed " \
"name=val,type=CephString " \
"name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"set pool parameter <var> to <val>", "osd", "rw", "cli,rest")
*
*/
-#include <algorithm>
#include <sstream>
#include "OSDMonitor.h"
failure_info.clear();
}
-static bool uses_gmt_hitset(const std::pair<int64_t, pg_pool_t>& pool) {
- return pool.second.use_gmt_hitset;
-}
// boot --
}
}
- if (std::find_if(osdmap.get_pools().begin(),
- osdmap.get_pools().end(),
- uses_gmt_hitset) != osdmap.get_pools().end()) {
- assert(osdmap.get_num_up_osds() == 0 ||
- osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT);
- if (!(m->osd_features & CEPH_FEATURE_OSD_HITSET_GMT)) {
- dout(0) << __func__ << " one or more pools uses GMT hitsets but osd at "
- << m->get_orig_source_inst()
- << " doesn't announce support -- ignore" << dendl;
- goto ignore;
- }
- }
-
// already booted?
if (osdmap.is_up(from) &&
osdmap.get_inst(from) == m->get_orig_source_inst()) {
if (!p->is_tier() &&
(var == "hit_set_type" || var == "hit_set_period" ||
var == "hit_set_count" || var == "hit_set_fpp" ||
- var == "use_gmt_hitset" ||
var == "target_max_objects" || var == "target_max_bytes" ||
var == "cache_target_full_ratio" ||
var == "cache_target_dirty_ratio" ||
BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
f->dump_float("hit_set_fpp", bloomp->get_fpp());
}
- } else if (var == "use_gmt_hitset") {
- f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
} else if (var == "target_max_objects") {
f->dump_unsigned("target_max_objects", p->target_max_objects);
} else if (var == "target_max_bytes") {
}
BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
ss << "hit_set_fpp: " << bloomp->get_fpp();
- } else if (var == "use_gmt_hitset") {
- ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
} else if (var == "target_max_objects") {
ss << "target_max_objects: " << p->target_max_objects;
} else if (var == "target_max_bytes") {
pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
if (g_conf->osd_pool_default_flag_nosizechange)
pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
- if (g_conf->osd_pool_use_gmt_hitset &&
- (osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT))
- pi->use_gmt_hitset = true;
- else
- pi->use_gmt_hitset = false;
pi->size = size;
pi->min_size = min_size;
}
BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
bloomp->set_fpp(f);
- } else if (var == "use_gmt_hitset") {
- if (val == "true" || (interr.empty() && n == 1)) {
- if (!(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_HITSET_GMT)) {
- ss << "not all OSDs support GMT hit set.";
- return -EINVAL;
- }
- p.use_gmt_hitset = true;
- } else {
- ss << "expecting value 'true' or '1'";
- return -EINVAL;
- }
} else if (var == "debug_fake_ec_pool") {
if (val == "true" || (interr.empty() && n == 1)) {
p.flags |= pg_pool_t::FLAG_DEBUG_FAKE_EC_POOL;
impl.reset(new ExplicitObjectHitSet(static_cast<ExplicitObjectHitSet::Params*>(params.impl.get())));
break;
+ case TYPE_NONE:
+ break;
+
default:
assert (0 == "unknown HitSet type");
}
p != info.hit_set.history.end();
++p) {
if (stamp >= p->begin && stamp <= p->end) {
- oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+ oid = get_hit_set_archive_object(p->begin, p->end);
break;
}
}
return hoid;
}
-hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start,
- utime_t end,
- bool using_gmt)
+hobject_t ReplicatedPG::get_hit_set_archive_object(utime_t start, utime_t end)
{
ostringstream ss;
- ss << "hit_set_" << info.pgid.pgid << "_archive_";
- if (using_gmt) {
- start.gmtime(ss) << "_";
- end.gmtime(ss);
- } else {
- start.localtime(ss) << "_";
- end.localtime(ss);
- }
+ ss << "hit_set_" << info.pgid.pgid << "_archive_" << start << "_" << end;
hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
info.pgid.ps(), info.pgid.pool(),
cct->_conf->osd_hit_set_namespace);
// Decide whether this PG should be tracking hit sets.
//
// '+' side of this hunk: one guard covers every "tracking is off" case --
// the PG is not active, is not primary, or the pool has hit_set_count == 0,
// hit_set_period == 0, or a hit set type of HitSet::TYPE_NONE.  In any of
// those cases hit_set_clear() is called and the function returns;
// hit_set_remove_all() is NOT called (the call is left commented out).
//
// '-' side: two separate checks; an active primary whose pool had tracking
// disabled additionally called hit_set_remove_all().
//
// NOTE(review): this is a diff excerpt and context lines between hunks may
// be elided, so the tail of the function may do more than what is shown
// here -- confirm against the full source.
void ReplicatedPG::hit_set_setup()
{
if (!is_active() ||
- !is_primary()) {
- hit_set_clear();
- return;
- }
-
- if (is_active() && is_primary() &&
- (!pool.info.hit_set_count ||
- !pool.info.hit_set_period ||
- pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
+ !is_primary() ||
+ !pool.info.hit_set_count ||
+ !pool.info.hit_set_period ||
+ pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
hit_set_clear();
-
- // only primary is allowed to remove all the hit set objects
- hit_set_remove_all();
+ //hit_set_remove_all(); // FIXME: implement me soon
return;
}
// tracking is enabled: apply log entries to the in-memory HitSet
hit_set_apply_log();
}
-void ReplicatedPG::hit_set_remove_all()
-{
- // If any archives are degraded we skip this
- for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
- p != info.hit_set.history.end();
- ++p) {
- hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
-
- // Once we hit a degraded object just skip
- if (is_degraded_or_backfilling_object(aoid))
- return;
- if (scrubber.write_blocked_by_scrub(aoid))
- return;
- }
-
- if (!info.hit_set.history.empty()) {
- list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
- assert(p != info.hit_set.history.rend());
- hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
- assert(!is_degraded_or_backfilling_object(oid));
- ObjectContextRef obc = get_object_context(oid, false);
- assert(obc);
-
- RepGather *repop = simple_repop_create(obc);
- OpContext *ctx = repop->ctx;
- ctx->at_version = get_next_version();
- ctx->updated_hset_history = info.hit_set;
- utime_t now = ceph_clock_now(cct);
- ctx->mtime = now;
- hit_set_trim(repop, 0);
- info.stats.stats.add(ctx->delta_stats);
- simple_repop_submit(repop);
- }
-
- info.hit_set = pg_hit_set_history_t();
- if (agent_state) {
- agent_state->discard_hit_sets();
- }
-}
-
void ReplicatedPG::hit_set_create()
{
utime_t now = ceph_clock_now(NULL);
for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
p != info.hit_set.history.end();
++p) {
- hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+ hobject_t aoid = get_hit_set_archive_object(p->begin, p->end);
// Once we hit a degraded object just skip further trim
if (is_degraded_or_backfilling_object(aoid))
return;
}
- oid = get_hit_set_archive_object(start, now, pool.info.use_gmt_hitset);
+ oid = get_hit_set_archive_object(start, now);
// If the current object is degraded we skip this persist request
+ if (is_degraded_or_backfilling_object(oid))
+ return;
if (scrubber.write_blocked_by_scrub(oid))
return;
updated_hit_set_hist.history.push_back(updated_hit_set_hist.current_info);
hit_set_create();
- updated_hit_set_hist.current_info = pg_hit_set_info_t(pool.info.use_gmt_hitset);
+ updated_hit_set_hist.current_info = pg_hit_set_info_t();
updated_hit_set_hist.current_last_stamp = utime_t();
// fabricate an object_info_t and SnapSet
for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
assert(p != updated_hit_set_hist.history.end());
- hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
assert(!is_degraded_or_backfilling_object(oid));
continue;
}
- hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end);
if (is_unreadable_object(oid)) {
dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
break;
bool hit_set_apply_log(); ///< apply log entries to update in-memory HitSet
void hit_set_trim(RepGather *repop, unsigned max); ///< discard old HitSets
void hit_set_in_memory_trim(); ///< discard old in memory HitSets
- void hit_set_remove_all();
hobject_t get_hit_set_current_object(utime_t stamp);
- hobject_t get_hit_set_archive_object(utime_t start,
- utime_t end,
- bool using_gmt);
+ hobject_t get_hit_set_archive_object(utime_t start, utime_t end);
// agent
boost::scoped_ptr<TierAgentState> agent_state;
f->close_section(); // hit_set_params
f->dump_unsigned("hit_set_period", hit_set_period);
f->dump_unsigned("hit_set_count", hit_set_count);
- f->dump_bool("use_gmt_hitset", use_gmt_hitset);
f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
f->dump_unsigned("stripe_width", get_stripe_width());
f->dump_unsigned("expected_num_objects", expected_num_objects);
return;
}
- ENCODE_START(21, 5, bl);
+ ENCODE_START(17, 5, bl);
::encode(type, bl);
::encode(size, bl);
::encode(crush_ruleset, bl);
::encode(last_force_op_resend, bl);
::encode(min_read_recency_for_promote, bl);
::encode(expected_num_objects, bl);
- ::encode(uint32_t(.6 * 1e6), bl);
- ::encode(uint32_t(1), bl);
- ::encode(use_gmt_hitset, bl);
ENCODE_FINISH(bl);
}
void pg_pool_t::decode(bufferlist::iterator& bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(21, 5, 5, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(17, 5, 5, bl);
::decode(type, bl);
::decode(size, bl);
::decode(crush_ruleset, bl);
} else {
expected_num_objects = 0;
}
- if (struct_v >= 19) {
- uint32_t dummy;
- ::decode(dummy, bl);
- }
- if (struct_v >= 20) {
- uint32_t dummy;
- ::decode(dummy, bl);
- }
- if (struct_v >= 21) {
- ::decode(use_gmt_hitset, bl);
- } else {
- use_gmt_hitset = false;
- }
DECODE_FINISH(bl);
calc_pg_masks();
}
// Serialize this pg_hit_set_info_t into bl.
//
// '+' side of this hunk: the fields written are begin, end and version, in
// that order, between ENCODE_START(1, 1, bl) and ENCODE_FINISH(bl).
// '-' side: used ENCODE_START(2, 1, bl) and appended using_gmt after version.
//
// NOTE(review): the first ENCODE_START argument is presumably the struct
// version and the second the oldest compatible version -- confirm against
// the macro definition.  Lowering it from 2 to 1 means data already written
// with the using_gmt field will be read by a version-1 decoder; verify that
// case is handled.
void pg_hit_set_info_t::encode(bufferlist& bl) const
{
- ENCODE_START(2, 1, bl);
+ ENCODE_START(1, 1, bl);
::encode(begin, bl);
::encode(end, bl);
::encode(version, bl);
- ::encode(using_gmt, bl);
ENCODE_FINISH(bl);
}
// Deserialize a pg_hit_set_info_t from the iterator p.
//
// '+' side of this hunk: reads begin, end and version, in the same order
// encode() writes them, between DECODE_START(1, p) and DECODE_FINISH(p).
// '-' side: used DECODE_START(2, p) and, when struct_v >= 2, also read
// using_gmt (defaulting it to false for older encodings).
//
// NOTE(review): after this change a version-2 encoding still carries a
// trailing using_gmt field that is no longer read here -- presumably
// DECODE_FINISH skips unread trailing bytes; confirm against the macro
// definition.
void pg_hit_set_info_t::decode(bufferlist::iterator& p)
{
- DECODE_START(2, p);
+ DECODE_START(1, p);
::decode(begin, p);
::decode(end, p);
::decode(version, p);
- if (struct_v >= 2) {
- ::decode(using_gmt, p);
- } else {
- using_gmt = false;
- }
DECODE_FINISH(p);
}
f->dump_stream("begin") << begin;
f->dump_stream("end") << end;
f->dump_stream("version") << version;
- f->dump_stream("using_gmt") << using_gmt;
}
void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds)
uint32_t hit_set_count; ///< number of periods to retain
- bool use_gmt_hitset; ///< use gmt to name the hitset archive object
uint32_t min_read_recency_for_promote; ///< minimum number of HitSet to check before promote
uint32_t stripe_width; ///< erasure coded stripe size in bytes
hit_set_params(),
hit_set_period(0),
hit_set_count(0),
- use_gmt_hitset(true),
min_read_recency_for_promote(0),
stripe_width(0),
expected_num_objects(0)
struct pg_hit_set_info_t {
utime_t begin, end; ///< time interval
eversion_t version; ///< version this HitSet object was written
- bool using_gmt; ///< use gmt for creating the hit_set archive object name
- pg_hit_set_info_t(bool using_gmt = true)
- : using_gmt(using_gmt) {}
- pg_hit_set_info_t(utime_t b, bool using_gmt)
- : begin(b), using_gmt(using_gmt) {}
+
+ pg_hit_set_info_t() {}
+ pg_hit_set_info_t(utime_t b)
+ : begin(b) {}
void encode(bufferlist &bl) const;
void decode(bufferlist::iterator &bl);