]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: probabilistic promotion throttling
authorSage Weil <sage@redhat.com>
Mon, 1 Feb 2016 15:50:44 +0000 (10:50 -0500)
committerSage Weil <sage@redhat.com>
Tue, 1 Mar 2016 13:43:46 +0000 (08:43 -0500)
Global tunables osd_tier_promote_max_{objects,bytes}_sec adjust a
probability that any given promotion we might otherwise do will be done
or rejected.  Based on the observed rate of promotion, and the target/max,
we will adjust the probability up or down.

This avoids the complexity of a throttle queue and gets us a reasonably
accurate throttle.  What it doesn't yet do is cope with thundering herds of
promotions.

Signed-off-by: Sage Weil <sage@redhat.com>
src/common/config_opts.h
src/osd/OSD.cc
src/osd/OSD.h
src/osd/ReplicatedPG.cc
src/osd/osd_types.h

index 03d4c6abf00844b41054e75ffb6df109afa58d39..30ceeac2ebdc247e234466338e3540455a808027 100644 (file)
@@ -633,6 +633,9 @@ OPTION(osd_hit_set_min_size, OPT_INT, 1000)  // min target size for a HitSet
 OPTION(osd_hit_set_max_size, OPT_INT, 100000)  // max target size for a HitSet
 OPTION(osd_hit_set_namespace, OPT_STR, ".ceph-internal") // rados namespace for hit_set tracking
 
+OPTION(osd_tier_promote_max_objects_sec, OPT_U64, 0)  // target max promotion rate in objects/sec; 0 = no object-rate limit
+OPTION(osd_tier_promote_max_bytes_sec, OPT_U64, 0)  // target max promotion rate in bytes/sec; 0 = no byte-rate limit
+
 OPTION(osd_tier_default_cache_mode, OPT_STR, "writeback")
 OPTION(osd_tier_default_cache_hit_set_count, OPT_INT, 4)
 OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200)
index f5eeaa6256916f68965add2820d872b522e9b21e..0bcdead51b11edbd87e10030371bf9aa894b19a9 100644 (file)
@@ -235,6 +235,7 @@ OSDService::OSDService(OSD *osd) :
   agent_stop_flag(false),
   agent_timer_lock("OSD::agent_timer_lock"),
   agent_timer(osd->client_messenger->cct, agent_timer_lock),
+  promote_probability_millis(1000),
   objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
   objecter_finisher(osd->client_messenger->cct),
   watch_lock("OSD::watch_lock"),
@@ -596,6 +597,74 @@ void OSDService::agent_stop()
 
 // -------------------------------------
 
+void OSDService::promote_throttle_recalibrate()
+{
+  utime_t now = ceph_clock_now(NULL);
+  double dur = now - last_recalibrate;
+  last_recalibrate = now;
+  unsigned prob = promote_probability_millis;
+
+  uint64_t target_obj_sec = g_conf->osd_tier_promote_max_objects_sec;
+  uint64_t target_bytes_sec = g_conf->osd_tier_promote_max_bytes_sec;
+
+  unsigned min_prob = 1;
+
+  uint64_t attempts, obj, bytes;
+  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
+  dout(10) << __func__ << " " << attempts << " attempts, promoted "
+          << obj << " objects and " << pretty_si_t(bytes) << " bytes; target "
+          << target_obj_sec << " obj/sec or "
+          << pretty_si_t(target_bytes_sec) << " bytes/sec"
+          << dendl;
+
+  // calculate what the probability *should* be, given the targets
+  unsigned new_prob;
+  if (attempts && dur > 0) {
+    uint64_t avg_size = 1;
+    if (obj)
+      avg_size = MAX(bytes / obj, 1);
+    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
+    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
+      / (double)attempts;
+    derr << __func__ << "  po " << po << " pb " << pb << " avg_size " << avg_size << dendl;
+    if (target_obj_sec && target_bytes_sec)
+      new_prob = MIN(po, pb);
+    else if (target_obj_sec)
+      new_prob = po;
+    else if (target_bytes_sec)
+      new_prob = pb;
+    else
+      new_prob = 1000;
+  } else {
+    new_prob = 1000;
+  }
+  dout(20) << __func__ << "  new_prob " << new_prob << dendl;
+
+  // correct for persistent skew between target rate and actual rate, adjust
+  double ratio = 1.0;
+  unsigned actual = 0;
+  if (attempts && obj) {
+    actual = obj * 1000 / attempts;
+    ratio = (double)actual / (double)prob;
+    new_prob = (double)new_prob / ratio;
+  }
+  new_prob = MAX(new_prob, min_prob);
+  new_prob = MIN(new_prob, 1000);
+
+  // adjust
+  prob = (prob + new_prob) / 2;
+  prob = MAX(prob, min_prob);
+  prob = MIN(prob, 1000);
+  dout(10) << __func__ << "  actual " << actual
+          << ", actual/prob ratio " << ratio
+          << ", adjusted new_prob " << new_prob
+          << ", prob " << promote_probability_millis << " -> " << prob
+          << dendl;
+  promote_probability_millis = prob;
+}
+
+// -------------------------------------
+
 float OSDService::get_full_ratio()
 {
   float full_ratio = cct->_conf->osd_failsafe_full_ratio;
@@ -4120,6 +4189,8 @@ void OSD::tick()
     recovery_tp.wake();
 
     check_replay_queue();
+
+    service.promote_throttle_recalibrate();
   }
 
   // only do waiters if dispatch() isn't currently running.  (if it is,
index 57848540204f1a92f4734c15cb1e918b6b3c6755..6f29ce9508d20a84af78f7175b58a816e300a1cf 100644 (file)
@@ -781,6 +781,22 @@ public:
     flush_mode_high_count --;
   }
 
+  /// throttle promotion attempts
+  unsigned promote_probability_millis; ///< probability thousands. one word.
+  PromoteCounter promote_counter;
+  utime_t last_recalibrate;
+
+  /**
+   * Decide whether a promotion that would otherwise be done should be
+   * rejected.
+   *
+   * Records the attempt in promote_counter, then draws a value in [0, 999]
+   * and compares it with promote_probability_millis.
+   *
+   * @return true to throttle (do not promote), false to go ahead and promote
+   */
+  bool promote_throttle() {
+    // NOTE: lockless!  we rely on the probability being a single word.
+    promote_counter.attempt();
+    // The draw is in [0, 999]; promoting only when it is strictly below the
+    // probability gives exactly promote_probability_millis/1000.  (Using ">"
+    // here would promote with probability (p+1)/1000.)
+    if ((unsigned)rand() % 1000 >= promote_probability_millis)
+      return true;  //  yes throttle (no promote)
+    return false;   //   no throttle (promote)
+  }
+  /// Account for one finished promotion of @p bytes bytes by forwarding it
+  /// to promote_counter.
+  void promote_finish(uint64_t bytes)
+  {
+    promote_counter.finish(bytes);
+  }
+  void promote_throttle_recalibrate();
 
   // -- Objecter, for teiring reads/writes from/to other OSDs --
   Objecter *objecter;
index 4402ec2b0be08defada0fa14bf6c50c050555b88..fe3d2cb3c8cfeadea3d1112aca518548d4a5f5a1 100644 (file)
@@ -2335,12 +2335,11 @@ bool ReplicatedPG::maybe_promote(ObjectContextRef obc,
 
   switch (recency) {
   case 0:
-    promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
     break;
   case 1:
     // Check if in the current hit set
     if (in_hit_set) {
-      promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
+      break;
     } else {
       // not promoting
       return false;
@@ -2366,15 +2365,18 @@ bool ReplicatedPG::maybe_promote(ObjectContextRef obc,
        }
       }
       if (count >= recency) {
-       promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
-      } else {
-       // not promoting
-       return false;
+       break;
       }
+      return false;    // not promoting
     }
     break;
   }
 
+  if (osd->promote_throttle()) {
+    dout(10) << __func__ << " promote throttled" << dendl;
+    return false;
+  }
+  promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
   return true;
 }
 
@@ -7608,6 +7610,8 @@ void ReplicatedPG::finish_promote(int r, CopyResults *results,
     return;
   }
 
+  osd->promote_finish(results->object_size);
+
   OpContextUPtr tctx =  simple_opc_create(obc);
   tctx->at_version = get_next_version();
 
index 49b4e2bd8ec38d5b556d8cd139929b2fd0c98129..a3ef16a2bcaf52194e0ccd3b1bea0c769aaa9862 100644 (file)
@@ -4296,4 +4296,29 @@ enum scrub_error_type {
   DEEP_ERROR,
   SHALLOW_ERROR
 };
+
+// PromoteCounter
+
+struct PromoteCounter {
+  atomic64_t attempts, objects, bytes;
+
+  void attempt() {
+    attempts.inc();
+  }
+
+  void finish(uint64_t size) {
+    objects.inc();
+    bytes.add(size);
+  }
+
+  void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
+    *a = attempts.read();
+    *o = objects.read();
+    *b = bytes.read();
+    attempts.set(*a / 2);
+    objects.set(*o / 2);
+    bytes.set(*b / 2);
+  }
+};
+
 #endif