From: Zhiqiang Wang Date: Mon, 28 Jul 2014 06:06:06 +0000 (+0800) Subject: osd: promotion on 2nd read for cache tiering X-Git-Tag: v0.80.11~39^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=7e2526784203b0f1bce08869aa7b1fda9c5eedd9;p=ceph.git osd: promotion on 2nd read for cache tiering http://tracker.ceph.com/issues/8674 Fixes: #8674 Signed-off-by: Zhiqiang Wang (cherry picked from commit 0ed3adc1e0a74bf9548d1d956aece11f019afee0) --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index d8ecdc70357..db929e767fd 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -452,6 +452,7 @@ OPTION(osd_tier_default_cache_mode, OPT_STR, "writeback") OPTION(osd_tier_default_cache_hit_set_count, OPT_INT, 4) OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200) OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom") +OPTION(osd_tier_default_cache_min_read_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on read) OPTION(osd_map_dedup, OPT_BOOL, true) OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size! 
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 461b3f29e0a..d0908cba6ec 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -564,11 +564,11 @@ COMMAND("osd pool rename " \ "rename to ", "osd", "rw", "cli,rest") COMMAND("osd pool get " \ "name=pool,type=CephPoolname " \ - "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile", \ + "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote", \ "get pool parameter ", "osd", "r", "cli,rest") COMMAND("osd pool set " \ "name=pool,type=CephPoolname " \ - "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid " \ + "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote " \ "name=val,type=CephString " \ "name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \ "set pool parameter to ", "osd", "rw", "cli,rest") diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 861fac8cd90..ff9e160e32c 100644 --- 
a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -2592,6 +2592,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age); } else if (var == "erasure_code_profile") { f->dump_string("erasure_code_profile", p->erasure_code_profile); + } else if (var == "min_read_recency_for_promote") { + f->dump_int("min_read_recency_for_promote", p->min_read_recency_for_promote); } f->close_section(); @@ -2641,6 +2643,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) ss << "cache_min_evict_age: " << p->cache_min_evict_age; } else if (var == "erasure_code_profile") { ss << "erasure_code_profile: " << p->erasure_code_profile; + } else if (var == "min_read_recency_for_promote") { + ss << "min_read_recency_for_promote: " << p->min_read_recency_for_promote; } rdata.append(ss); @@ -3757,6 +3761,12 @@ int OSDMonitor::prepare_command_pool_set(map &cmdmap, return -EINVAL; } p.cache_min_evict_age = n; + } else if (var == "min_read_recency_for_promote") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + p.min_read_recency_for_promote = n; } else { ss << "unrecognized variable '" << var << "'"; return -EINVAL; @@ -5642,6 +5652,7 @@ done: ntp->cache_mode = mode; ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count; ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period; + ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote; ntp->hit_set_params = hsp; ntp->target_max_bytes = size; ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'"; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index c8fb01ebb77..04ed78ce8fe 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -1353,7 +1353,10 @@ void ReplicatedPG::do_op(OpRequestRef op) } } + bool in_hit_set = false; if (hit_set) { + if (missing_oid != hobject_t() && 
hit_set->contains(missing_oid)) + in_hit_set = true; hit_set->insert(oid); if (hit_set->is_full() || hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) { @@ -1366,7 +1369,7 @@ void ReplicatedPG::do_op(OpRequestRef op) } if ((m->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE) == 0 && - maybe_handle_cache(op, write_ordered, obc, r, missing_oid, false)) + maybe_handle_cache(op, write_ordered, obc, r, missing_oid, false, in_hit_set)) return; if (r) { @@ -1561,7 +1564,8 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op, bool write_ordered, ObjectContextRef obc, int r, const hobject_t& missing_oid, - bool must_promote) + bool must_promote, + bool in_hit_set) { if (obc) dout(25) << __func__ << " " << obc->obs.oi << " " @@ -1606,7 +1610,43 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op, if (!must_promote && can_skip_promote(op, obc)) { return false; } - promote_object(op, obc, missing_oid); + if (op->may_write() || must_promote || !hit_set) { + promote_object(op, obc, missing_oid); + } else { + switch (pool.info.min_read_recency_for_promote) { + case 0: + promote_object(op, obc, missing_oid); + break; + case 1: + // Check if in the current hit set + if (in_hit_set) { + promote_object(op, obc, missing_oid); + } else { + do_cache_redirect(op, obc); + } + break; + default: + if (in_hit_set) { + promote_object(op, obc, missing_oid); + } else { + // Check if in other hit sets + map<time_t,HitSetRef>::iterator itor; + bool in_other_hit_sets = false; + for (itor = agent_state->hit_set_map.begin(); itor != agent_state->hit_set_map.end(); itor++) { + if (itor->second->contains(missing_oid)) { + in_other_hit_sets = true; + break; + } + } + if (in_other_hit_sets) { + promote_object(op, obc, missing_oid); + } else { + do_cache_redirect(op, obc); + } + } + break; + } + } return true; case pg_pool_t::CACHEMODE_FORWARD: @@ -10952,8 +10992,10 @@ void ReplicatedPG::hit_set_persist() info.hit_set.current_info.end = now; dout(20) << __func__ << " archive " << oid << dendl; - if 
(agent_state) + if (agent_state) { agent_state->add_hit_set(info.hit_set.current_info.begin, hit_set); + hit_set_in_memory_trim(); + } // hold a ref until it is flushed to disk hit_set_flushing[info.hit_set.current_info.begin] = hit_set; @@ -11089,8 +11131,6 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max) repop->ctx->op_t->remove(oid); repop->ctx->log.back().mod_desc.mark_unrollbackable(); } - if (agent_state) - agent_state->remove_oldest_hit_set(); updated_hit_set_hist.history.pop_front(); ObjectContextRef obc = get_object_context(oid, false); @@ -11101,6 +11141,19 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max) } } +void ReplicatedPG::hit_set_in_memory_trim() +{ + unsigned max = pool.info.hit_set_count; + unsigned max_in_memory = pool.info.min_read_recency_for_promote > 0 ? pool.info.min_read_recency_for_promote - 1 : 0; + + if (max_in_memory > max) { + max_in_memory = max; + } + while (agent_state->hit_set_map.size() > max_in_memory) { + agent_state->remove_oldest_hit_set(); + } +} + // ======================================= // cache agent @@ -11293,6 +11346,9 @@ bool ReplicatedPG::agent_work(int start_max) else agent_state->position = next; + // Discard old in memory HitSets + hit_set_in_memory_trim(); + if (need_delay) { assert(agent_state->delaying == false); agent_delay(); @@ -11307,7 +11363,6 @@ bool ReplicatedPG::agent_work(int start_max) void ReplicatedPG::agent_load_hit_sets() { if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) { - agent_state->discard_hit_sets(); return; } diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 4b0d1d69441..c8e1c4b3ca6 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -800,6 +800,7 @@ protected: void hit_set_persist(); ///< persist hit info bool hit_set_apply_log(); ///< apply log entries to update in-memory HitSet void hit_set_trim(RepGather *repop, unsigned max); ///< discard old HitSets + void hit_set_in_memory_trim(); ///< discard old 
in memory HitSets hobject_t get_hit_set_current_object(utime_t stamp); hobject_t get_hit_set_archive_object(utime_t start, utime_t end); @@ -1054,7 +1055,8 @@ protected: bool write_ordered, ObjectContextRef obc, int r, const hobject_t& missing_oid, - bool must_promote); + bool must_promote, + bool in_hit_set = false); /** * This helper function tells the client to redirect their request elsewhere. */ diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index c2ce8846f42..9069923ba9b 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -807,6 +807,7 @@ void pg_pool_t::dump(Formatter *f) const f->close_section(); // hit_set_params f->dump_unsigned("hit_set_period", hit_set_period); f->dump_unsigned("hit_set_count", hit_set_count); + f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote); f->dump_unsigned("stripe_width", get_stripe_width()); } @@ -1107,7 +1108,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const return; } - ENCODE_START(15, 5, bl); + ENCODE_START(16, 5, bl); ::encode(type, bl); ::encode(size, bl); ::encode(crush_ruleset, bl); @@ -1147,12 +1148,13 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const ::encode(cache_min_evict_age, bl); ::encode(erasure_code_profile, bl); ::encode(last_force_op_resend, bl); + ::encode(min_read_recency_for_promote, bl); ENCODE_FINISH(bl); } void pg_pool_t::decode(bufferlist::iterator& bl) { - DECODE_START_LEGACY_COMPAT_LEN(15, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(16, 5, 5, bl); ::decode(type, bl); ::decode(size, bl); ::decode(crush_ruleset, bl); @@ -1254,6 +1256,12 @@ void pg_pool_t::decode(bufferlist::iterator& bl) } else { last_force_op_resend = 0; } + if (struct_v >= 16) { + ::decode(min_read_recency_for_promote, bl); + } else { + pg_pool_t def; + min_read_recency_for_promote = def.min_read_recency_for_promote; + } DECODE_FINISH(bl); calc_pg_masks(); } @@ -1299,6 +1307,7 @@ void pg_pool_t::generate_test_instances(list& o) a.hit_set_params = 
HitSet::Params(new BloomHitSet::Params); a.hit_set_period = 3600; a.hit_set_count = 8; + a.min_read_recency_for_promote = 1; a.set_stripe_width(12345); a.target_max_bytes = 1238132132; a.target_max_objects = 1232132; @@ -1351,6 +1360,8 @@ ostream& operator<<(ostream& out, const pg_pool_t& p) << " " << p.hit_set_period << "s" << " x" << p.hit_set_count; } + if (p.min_read_recency_for_promote) + out << " min_read_recency_for_promote " << p.min_read_recency_for_promote; out << " stripe_width " << p.get_stripe_width(); return out; } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index a296df04693..e7c45bfca3b 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -973,6 +973,7 @@ public: HitSet::Params hit_set_params; ///< The HitSet params to use on this pool uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds) uint32_t hit_set_count; ///< number of periods to retain + uint32_t min_read_recency_for_promote; ///< minimum number of HitSets to check before promoting (on read) uint32_t stripe_width; ///< erasure coded stripe size in bytes @@ -997,6 +998,7 @@ public: hit_set_params(), hit_set_period(0), hit_set_count(0), + min_read_recency_for_promote(0), stripe_width(0) { }