From: Sage Weil Date: Mon, 1 Feb 2016 15:50:44 +0000 (-0500) Subject: osd: probabilistic promotion throttling X-Git-Tag: v10.1.0~145^2~3 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=46641a9b73ea9709e451d8a78e799352f4b2ba7b;p=ceph.git osd: probabilistic promotion throttling Global tunables osd_tier_promote_max_{objects,bytes}_sec adjust a probability that any given promotion we might otherwise do will be done or rejected. Based on the observed rate of promotion, and the target/max, we will adjust the probability up or down. This avoids the complexity of a throttle queue and gets us a reasonably accurate throttle. What it doesn't yet do is cope with thundering herds of promotions. Signed-off-by: Sage Weil --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 03d4c6abf00..30ceeac2ebd 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -633,6 +633,9 @@ OPTION(osd_hit_set_min_size, OPT_INT, 1000) // min target size for a HitSet OPTION(osd_hit_set_max_size, OPT_INT, 100000) // max target size for a HitSet OPTION(osd_hit_set_namespace, OPT_STR, ".ceph-internal") // rados namespace for hit_set tracking +OPTION(osd_tier_promote_max_objects_sec, OPT_U64, 0) +OPTION(osd_tier_promote_max_bytes_sec, OPT_U64, 0) + OPTION(osd_tier_default_cache_mode, OPT_STR, "writeback") OPTION(osd_tier_default_cache_hit_set_count, OPT_INT, 4) OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f5eeaa62569..0bcdead51b1 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -235,6 +235,7 @@ OSDService::OSDService(OSD *osd) : agent_stop_flag(false), agent_timer_lock("OSD::agent_timer_lock"), agent_timer(osd->client_messenger->cct, agent_timer_lock), + promote_probability_millis(1000), objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)), objecter_finisher(osd->client_messenger->cct), watch_lock("OSD::watch_lock"), @@ 
-596,6 +597,74 @@ void OSDService::agent_stop() // ------------------------------------- +void OSDService::promote_throttle_recalibrate() +{ + utime_t now = ceph_clock_now(NULL); + double dur = now - last_recalibrate; + last_recalibrate = now; + unsigned prob = promote_probability_millis; + + uint64_t target_obj_sec = g_conf->osd_tier_promote_max_objects_sec; + uint64_t target_bytes_sec = g_conf->osd_tier_promote_max_bytes_sec; + + unsigned min_prob = 1; + + uint64_t attempts, obj, bytes; + promote_counter.sample_and_attenuate(&attempts, &obj, &bytes); + dout(10) << __func__ << " " << attempts << " attempts, promoted " + << obj << " objects and " << pretty_si_t(bytes) << " bytes; target " + << target_obj_sec << " obj/sec or " + << pretty_si_t(target_bytes_sec) << " bytes/sec" + << dendl; + + // calculate what the probability *should* be, given the targets + unsigned new_prob; + if (attempts && dur > 0) { + uint64_t avg_size = 1; + if (obj) + avg_size = MAX(bytes / obj, 1); + unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts; + unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0 + / (double)attempts; + derr << __func__ << " po " << po << " pb " << pb << " avg_size " << avg_size << dendl; + if (target_obj_sec && target_bytes_sec) + new_prob = MIN(po, pb); + else if (target_obj_sec) + new_prob = po; + else if (target_bytes_sec) + new_prob = pb; + else + new_prob = 1000; + } else { + new_prob = 1000; + } + dout(20) << __func__ << " new_prob " << new_prob << dendl; + + // correct for persistent skew between target rate and actual rate, adjust + double ratio = 1.0; + unsigned actual = 0; + if (attempts && obj) { + actual = obj * 1000 / attempts; + ratio = (double)actual / (double)prob; + new_prob = (double)new_prob / ratio; + } + new_prob = MAX(new_prob, min_prob); + new_prob = MIN(new_prob, 1000); + + // adjust + prob = (prob + new_prob) / 2; + prob = MAX(prob, min_prob); + prob = MIN(prob, 1000); + dout(10) << __func__ << " 
actual " << actual + << ", actual/prob ratio " << ratio + << ", adjusted new_prob " << new_prob + << ", prob " << promote_probability_millis << " -> " << prob + << dendl; + promote_probability_millis = prob; +} + +// ------------------------------------- + float OSDService::get_full_ratio() { float full_ratio = cct->_conf->osd_failsafe_full_ratio; @@ -4120,6 +4189,8 @@ void OSD::tick() recovery_tp.wake(); check_replay_queue(); + + service.promote_throttle_recalibrate(); } // only do waiters if dispatch() isn't currently running. (if it is, diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 57848540204..6f29ce9508d 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -781,6 +781,22 @@ public: flush_mode_high_count --; } + /// throttle promotion attempts + unsigned promote_probability_millis; ///< probability thousands. one word. + PromoteCounter promote_counter; + utime_t last_recalibrate; + + bool promote_throttle() { + // NOTE: lockless! we rely on the probability being a single word. + promote_counter.attempt(); + if ((unsigned)rand() % 1000 > promote_probability_millis) + return true; // yes throttle (no promote) + return false; // no throttle (promote) + } + void promote_finish(uint64_t bytes) { + promote_counter.finish(bytes); + } + void promote_throttle_recalibrate(); // -- Objecter, for teiring reads/writes from/to other OSDs -- Objecter *objecter; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 4402ec2b0be..fe3d2cb3c8c 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -2335,12 +2335,11 @@ bool ReplicatedPG::maybe_promote(ObjectContextRef obc, switch (recency) { case 0: - promote_object(obc, missing_oid, oloc, promote_op, promote_obc); break; case 1: // Check if in the current hit set if (in_hit_set) { - promote_object(obc, missing_oid, oloc, promote_op, promote_obc); + break; } else { // not promoting return false; @@ -2366,15 +2365,18 @@ bool ReplicatedPG::maybe_promote(ObjectContextRef obc, } } if (count >= recency) { - 
promote_object(obc, missing_oid, oloc, promote_op, promote_obc); - } else { - // not promoting - return false; + break; } + return false; // not promoting } break; } + if (osd->promote_throttle()) { + dout(10) << __func__ << " promote throttled" << dendl; + return false; + } + promote_object(obc, missing_oid, oloc, promote_op, promote_obc); return true; } @@ -7608,6 +7610,8 @@ void ReplicatedPG::finish_promote(int r, CopyResults *results, return; } + osd->promote_finish(results->object_size); + OpContextUPtr tctx = simple_opc_create(obc); tctx->at_version = get_next_version(); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 49b4e2bd8ec..a3ef16a2bca 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -4296,4 +4296,29 @@ enum scrub_error_type { DEEP_ERROR, SHALLOW_ERROR }; + +// PromoteCounter + +struct PromoteCounter { + atomic64_t attempts, objects, bytes; + + void attempt() { + attempts.inc(); + } + + void finish(uint64_t size) { + objects.inc(); + bytes.add(size); + } + + void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) { + *a = attempts.read(); + *o = objects.read(); + *b = bytes.read(); + attempts.set(*a / 2); + objects.set(*o / 2); + bytes.set(*b / 2); + } +}; + #endif