]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: SlowFastCoDel added to the BlueStore
authorEsmaeil Mirvakili <smirvaki@ucsc.edu>
Tue, 1 Feb 2022 10:18:51 +0000 (02:18 -0800)
committeresmaeil-mirvakili <smirvaki@ucsc.edu>
Tue, 22 Mar 2022 14:22:12 +0000 (07:22 -0700)
Signed-off-by: Esmaeil Mirvakili <smirvaki@ucsc.edu>
doc/dev/bluestore-codel.rst [new file with mode: 0644]
qa/objectstore_debug/bluestore-codel.yaml [new file with mode: 0644]
src/common/options/global.yaml.in
src/common/regression_utils.h [new file with mode: 0644]
src/crimson/os/alienstore/CMakeLists.txt
src/os/CMakeLists.txt
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h
src/os/bluestore/BlueStoreSlowFastCoDel.cc [new file with mode: 0644]
src/os/bluestore/BlueStoreSlowFastCoDel.h [new file with mode: 0644]

diff --git a/doc/dev/bluestore-codel.rst b/doc/dev/bluestore-codel.rst
new file mode 100644 (file)
index 0000000..4aa0154
--- /dev/null
@@ -0,0 +1,41 @@
+============================================
+BlueStore Bufferbloat Mitigation Using CoDel
+============================================
+
+
+Introduction
+------------
+Bufferbloat happens when a frontend buffer too much data to a backend.
+This can introduce latency spikes to the backend and compromise the
+request schedulability of the frontend.
+
+BlueStore has the bufferbloat problem due to its large queue. All
+write requests are submitted immediately to BlueStore to achieve high
+performance. However, this can compromise request schedulability in OSD.
+As a solution, the CoDel algorithm is implemented in the BlueStore as
+an admission control system to control the amount of transaction
+submitted to BlueStore. This mechanism will negatively impact the
+throughput of BlueStore. However, a tradeoff parameter has been introduced
+to control BlueStore throughput loss versus BlueStore latency decrease.
+
+Configurations
+--------------
+CoDel can be enabled using "*bluestore_codel*" config. The other important
+config that needs to be set is "*bluestore_codel_throughput_latency_tradeoff*".
+This config adjust the tradeoff between BlueStore throughput loss and
+BlueStore latency decrease. This parameter defines the amount of throughput
+loss in MB/s for one ms decrease in BlueStore latency. For example, a value
+of 5 means that we are willing to lose maximum of 5 MB/s of throughput for
+every 1 ms decrease in BlueStore latency.
+
+Experiments
+-----------
+For measuring the impact of BlueStore CoDel on BlueStore, we measured the
+transaction latency inside the BlueStore (BlueStore latency) and BlueStore
+throughput. We compared this measurements with measurements from Vanilla BlueStore.
+These experiments shows that:
+
+1. The BlueStore CoDel can decrease the BlueStore latency by small and controllable
+impact on throughput.
+2. The BlueStore CoDel can react to workload changes to keep the desired tradeoff
+between latency and throughput.
diff --git a/qa/objectstore_debug/bluestore-codel.yaml b/qa/objectstore_debug/bluestore-codel.yaml
new file mode 100644 (file)
index 0000000..40f6fc0
--- /dev/null
@@ -0,0 +1,45 @@
+overrides:
+  thrashosds:
+    bdev_inject_crash: 2
+    bdev_inject_crash_probability: .5
+  ceph:
+    fs: xfs
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore fsck on mount: true
+        bluestore allocator: bitmap
+        # lower the full ratios since we can fill up a 100gb osd so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+        # this doesn't work with failures bc the log writes are not atomic across the two backends
+        #        bluestore bluefs env mirror: true
+        bdev enable discard: true
+        bdev async discard: true
+        bluestore codel: true
+  ceph-deploy:
+    fs: xfs
+    bluestore: yes
+    conf:
+      osd:
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        bluestore fsck on mount: true
+        # lower the full ratios since we can fill up a 100gb osd so quickly
+        mon osd full ratio: .9
+        mon osd backfillfull_ratio: .85
+        mon osd nearfull ratio: .8
+        osd failsafe full ratio: .95
+        bdev enable discard: true
+        bdev async discard: true
+        bluestore codel: true
+
index e4e271a1b12e612ef7e1232e5b70c65b8b221a03..5183360b7301a47c05cdaed57ad5448f78fd68a8 100644 (file)
@@ -6312,3 +6312,72 @@ options:
   default: 0
   services:
   - mgr
+- name: bluestore_codel
+  type: bool
+  level: advanced
+  desc: enable/disable bluestore SlowFastCodel
+  default: false
+  with_legacy: true
+- name: bluestore_codel_throughput_latency_tradeoff
+  type: float
+  level: advanced
+  desc: adjust the tradeoff between throughput and bluestore latency in SlowFastCodel
+  long_desc: This parameter defines the amount of throughput loss (MB/s) for one ms
+    decrease in bluestore latency. (a value of 5 means that we are willing to lose
+    maximum of 5 MB/s of throughput for every 1 ms decrease in bluestore latency)
+  default: 5
+  with_legacy: true
+- name: bluestore_codel_initial_target_latency
+  type: float
+  level: advanced
+  desc: initial target latency for SlowFastCodel in ms
+  default: 5.0
+  with_legacy: true
+- name: bluestore_codel_slow_interval
+  type: float
+  level: advanced
+  desc: the interval of slow loop in SlowFastCodel in ms (this parameter should be larger that 'bluestore_codel_fast_interval')
+  default: 500.0
+  with_legacy: true
+- name: bluestore_codel_fast_interval
+  type: float
+  level: advanced
+  desc: the interval of the fast loop in SlowFastCodel in ms
+  default: 50.0
+  with_legacy: true
+- name: bluestore_codel_min_target_latency
+  type: float
+  level: advanced
+  desc: the minimum possible target latency in SlowFastCodel in ms
+  default: 1.0
+  with_legacy: true
+- name: bluestore_codel_max_target_latency
+  type: float
+  level: advanced
+  desc: the maximum possible target latency in SlowFastCodel in ms
+  default: 1000.0
+  with_legacy: true
+- name: bluestore_codel_initial_budget_bytes
+  type: size
+  level: advanced
+  desc: the initial bluestore throttle budget in SlowFastCodel
+  default: 100_K
+  with_legacy: true
+- name: bluestore_codel_min_budget_bytes
+  type: size
+  level: advanced
+  desc: the minimum bluestore throttle budget in SlowFastCodel
+  default: 100_K
+  with_legacy: true
+- name: bluestore_codel_budget_increment_bytes
+  type: size
+  level: advanced
+  desc: the increment size for opening the bluestore throttle in SlowFastCodel
+  default: 10_K
+  with_legacy: true
+- name: bluestore_codel_regression_history_size
+  type: int
+  level: advanced
+  desc: number of the slow interval throughput and latency samples that SlowFastCodel keeps for regression
+  default: 100
+  with_legacy: true
diff --git a/src/common/regression_utils.h b/src/common/regression_utils.h
new file mode 100644 (file)
index 0000000..8f21829
--- /dev/null
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+#include <cmath>
+#include <boost/numeric/ublas/matrix.hpp>
+
+#define Z_P 2.33  // z score for 99th percentile
+
+
+namespace ceph {
+  /***
+  * Calculate the inverse of a 2x2 matrix.
+  * @param matrix<double>& m, an square 2x2 matrix
+  * @return the inverse of the m (m^-1)
+  */
+  static boost::numeric::ublas::matrix<double>
+  matrix_inverse(boost::numeric::ublas::matrix<double> &m) {
+    assert(m.size1() == m.size2() &&
+           "Can only calculate the inverse of square matrices");
+    assert(m.size1() == 2 && m.size2() == 2 && "Only for 2x2 matrices");
+    boost::numeric::ublas::matrix<double> m_inverse(2, 2);
+    const double a = m(0, 0);
+    const double b = m(0, 1);
+    const double c = m(1, 0);
+    const double d = m(1, 1);
+    const double determinant = 1.0 / ((a * d) - (b * c));
+    m_inverse(0, 0) = d * determinant;
+    m_inverse(0, 1) = -b * determinant;
+    m_inverse(1, 0) = -c * determinant;
+    m_inverse(1, 1) = a * determinant;
+    return m_inverse;
+  }
+
+  /***
+  * Find a logarithmic function in form of "y = a + b * ln(x)" which fits
+  * the given points (x_values and y_values).
+  * @param std::vector<double> x_values, x values for sample points
+  * @param std::vector<double> y_values, y values for sample points
+  * @param double theta[2], holds the a and b as output (theta[0] = a and theta[1] = b)
+  */
+  static void regression(
+    const std::vector<double> &x_values,
+    const std::vector<double> &y_values,
+    double theta[2]) {
+    assert(x_values.size() == y_values.size() &&
+           "x and y values vectors should have a same size.");
+    const int n = x_values.size();
+
+    boost::numeric::ublas::matrix<double> y_m(n, 1);
+    for (int i = 0; i < n; i++) {
+      y_m(i, 0) = y_values[i];
+    }
+
+    boost::numeric::ublas::scalar_matrix<double> sm(n, 2, 1);
+    boost::numeric::ublas::matrix<double> x_new_m(sm);
+    for (int i = 0; i < n; i++) {
+      x_new_m(i, 0) = 1;
+      x_new_m(i, 1) = std::log(x_values[i]);
+    }
+    boost::numeric::ublas::matrix<double> x_new_trans_m = boost::numeric::ublas::trans(
+      x_new_m);
+    boost::numeric::ublas::matrix<double> x_new_trans_dot_x_new_m = boost::numeric::ublas::prod(
+      x_new_trans_m, x_new_m);
+    boost::numeric::ublas::matrix<double> temp_1_m = matrix_inverse(
+      x_new_trans_dot_x_new_m);
+    boost::numeric::ublas::matrix<double> temp_2_m = boost::numeric::ublas::prod(
+      x_new_trans_m, y_m);
+    boost::numeric::ublas::matrix<double> theta_m = boost::numeric::ublas::prod(
+      temp_1_m, temp_2_m);
+    theta[0] = theta_m(0, 0);
+    theta[1] = theta_m(1, 0);
+  }
+
+  /***
+  * Finds the x location on a fitted logarithmic curve on sample points where
+  * the slope is equal to target_slope
+  * @param x_values, x values for sample points
+  * @param y_values, y values for sample points
+  * @param target_slope, the slope that we are looking for
+  * @return the x location where the slope of the curve is target_slope
+  */
+  static double find_slope_on_curve(
+    const std::vector<double> &x_values,
+    const std::vector<double> &y_values,
+    double target_slope) {
+    assert(x_values.size() == y_values.size() &&
+           "x and y values vectors should have a same size.");
+    assert(target_slope != 0 &&
+           "The target slope of zero will result to a inf x, try a nonzero value.");
+    assert(target_slope >= 0 &&
+           "The target slope for a logarithmic function should be positive.");
+    double theta[2];    // theta[0] + theta[1] * ln(x)
+    regression(x_values, y_values,
+                           theta);     // find the logarithmic function using regression
+    double target_x = theta[1] /
+                      target_slope;  // find the x where the slope is close to target_slope
+    return target_x;
+  }
+
+  /***
+  * Finds the mu and std parameters of the lognormal distribution from its mode
+  * and x boundaries.
+  * @param mode, the mode of the distribution.
+  * @param min_x, x lower boundary of distribution (zero percentile)
+  * @param max_x, x upper boundary of distribution (99th percentile)
+  * @param params, holds the calculated distribution parameters (mu and std) as
+  * output (params[0] = mu and params[1] = std)
+  */
+  static void
+  find_log_normal_dist_params(double mode, double min_x, double max_x,
+                              double params[2]) {
+    assert(min_x < max_x && "The min_x should be smaller than max_x");
+    assert(mode >= min_x && mode < max_x &&
+           "The mode should be between min_x and max_x");
+    double max_x_normalized = max_x - min_x;
+    double mode_normalized = mode - min_x;
+    double std_dev = (-Z_P + std::sqrt(
+      Z_P * Z_P + 4 * std::log(max_x_normalized) -
+      4 * std::log(mode_normalized))) / 2;
+    double mu = std::log(max_x_normalized) - Z_P * std_dev;
+    params[0] = mu;
+    params[1] = std_dev;
+  }
+}
index f006ba33a850ce9aa8687dfe4aa881db1f6e9a3a..9d59225c79b3c9492532c50be63e8a6c9db6ffc1 100644 (file)
@@ -55,6 +55,7 @@ set(alien_store_srcs
   ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueStore.cc
   ${PROJECT_SOURCE_DIR}/src/os/bluestore/simple_bitmap.cc
   ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluestore_types.cc
+  ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueStoreSlowFastCoDel.cc
   ${PROJECT_SOURCE_DIR}/src/os/bluestore/fastbmap_allocator_impl.cc
   ${PROJECT_SOURCE_DIR}/src/os/bluestore/FreelistManager.cc
   ${PROJECT_SOURCE_DIR}/src/os/bluestore/HybridAllocator.cc
index 204a29fea8ccd17af31a265a17b7da73291a4f23..9bb6be0db3def73cf8a0a54facd8dbece8d02a48 100644 (file)
@@ -29,6 +29,7 @@ if(WITH_BLUESTORE)
     bluestore/BlueStore.cc
     bluestore/simple_bitmap.cc
     bluestore/bluestore_types.cc
+    bluestore/BlueStoreSlowFastCoDel.cc
     bluestore/fastbmap_allocator_impl.cc
     bluestore/FreelistManager.cc
     bluestore/StupidAllocator.cc
index 86062f290f0335d74012b7ff1745bb31a49a9406..a780cf925788dccbc62dd3d85906c4b346c3daff 100644 (file)
@@ -4601,6 +4601,15 @@ BlueStore::BlueStore(CephContext *cct,
   _init_logger();
   cct->_conf.add_observer(this);
   set_cache_shards(1);
+  if ( cct->_conf->bluestore_codel) {
+    codel = std::make_unique<BlueStoreSlowFastCoDel>(
+            cct, [this](int64_t x) mutable {
+                this->throttle.reset_kv_throttle_max(x);
+            },
+            [this]() mutable {
+                return this->throttle.get_kv_throttle_current();
+            });
+  }
 }
 
 BlueStore::~BlueStore()
@@ -4666,6 +4675,17 @@ const char **BlueStore::get_tracked_conf_keys() const
     "bluestore_warn_on_no_per_pool_omap",
     "bluestore_warn_on_no_per_pg_omap",
     "bluestore_max_defer_interval",
+    "bluestore_codel",
+    "bluestore_codel_slow_interval",
+    "bluestore_codel_fast_interval",
+    "bluestore_codel_initial_target_latency",
+    "bluestore_codel_min_target_latency",
+    "bluestore_codel_max_target_latency",
+    "bluestore_codel_throughput_latency_tradeoff",
+    "bluestore_codel_initial_budget_bytes",
+    "bluestore_codel_min_budget_bytes",
+    "bluestore_codel_budget_increment_bytes",
+    "bluestore_codel_regression_history_size",
     NULL
   };
   return KEYS;
@@ -4724,6 +4744,9 @@ void BlueStore::handle_conf_change(const ConfigProxy& conf,
       changed.count("bluestore_throttle_deferred_bytes") ||
       changed.count("bluestore_throttle_trace_rate")) {
     throttle.reset_throttle(conf);
+    if (codel) {
+      codel->reset_bluestore_budget();
+    }
   }
   if (changed.count("bluestore_max_defer_interval")) {
     if (bdev) {
@@ -4736,6 +4759,21 @@ void BlueStore::handle_conf_change(const ConfigProxy& conf,
       changed.count("osd_memory_expected_fragmentation")) {
     _update_osd_memory_options();
   }
+  if (changed.count("bluestore_codel") ||
+      changed.count("bluestore_codel_slow_interval") ||
+      changed.count("bluestore_codel_fast_interval") ||
+      changed.count("bluestore_codel_initial_target_latency") ||
+      changed.count("bluestore_codel_min_target_latency") ||
+      changed.count("bluestore_codel_max_target_latency") ||
+      changed.count("bluestore_codel_throughput_latency_tradeoff") ||
+      changed.count("bluestore_codel_initial_budget_bytes") ||
+      changed.count("bluestore_codel_min_budget_bytes") ||
+      changed.count("bluestore_codel_budget_increment_bytes") ||
+      changed.count("bluestore_codel_regression_history_size")) {
+    if (codel) {
+      codel->on_config_changed(cct);
+    }
+  }
 }
 
 void BlueStore::_set_compression()
@@ -12561,6 +12599,9 @@ void BlueStore::_txc_state_proc(TransContext *txc)
 
     case TransContext::STATE_KV_DONE:
       throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
+      if (codel) {
+        codel->update_from_txc_info(txc->txc_state_proc_start, txc->bytes);
+      }
       if (txc->deferred_txn) {
        txc->set_state(TransContext::STATE_DEFERRED_QUEUED);
        _deferred_queue(txc);
@@ -14032,6 +14073,7 @@ int BlueStore::queue_transactions(
   logger->inc(l_bluestore_txc);
 
   // execute (start)
+  txc->txc_state_proc_start = mono_clock::now();
   _txc_state_proc(txc);
 
   if (bdev->is_smr()) {
index 0f804595ebb37cbc96551a3f5ed008781c60baa3..e35e0bd4c34401650bad7e2f35ae2f5202c0d2fd 100644 (file)
@@ -51,6 +51,7 @@
 #include "bluestore_types.h"
 #include "BlueFS.h"
 #include "common/EventTrace.h"
+#include "BlueStoreSlowFastCoDel.h"
 
 #ifdef WITH_BLKIN
 #include "common/zipkin_trace.h"
@@ -1724,6 +1725,7 @@ public:
     uint64_t seq = 0;
     ceph::mono_clock::time_point start;
     ceph::mono_clock::time_point last_stamp;
+    ceph::mono_clock::time_point txc_state_proc_start;
 
     uint64_t last_nid = 0;     ///< if non-zero, highest new nid we allocated
     uint64_t last_blobid = 0;  ///< if non-zero, highest new blobid we allocated
@@ -1900,8 +1902,16 @@ public:
       trace_period_mcs = rate > 0 ? floor((1/rate) * 1000000.0) : 0;
 #endif
     }
+    int64_t get_kv_throttle_current() {
+      return throttle_bytes.get_current();
+    }
+    void reset_kv_throttle_max(int64_t m) {
+      throttle_bytes.reset_max(m);
+    }
   } throttle;
 
+    std::unique_ptr<BlueStoreSlowFastCoDel> codel;
+
   typedef boost::intrusive::list<
     TransContext,
     boost::intrusive::member_hook<
diff --git a/src/os/bluestore/BlueStoreSlowFastCoDel.cc b/src/os/bluestore/BlueStoreSlowFastCoDel.cc
new file mode 100644 (file)
index 0000000..e85a934
--- /dev/null
@@ -0,0 +1,272 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include "BlueStoreSlowFastCoDel.h"
+
+#include "common/regression_utils.h"
+
+BlueStoreSlowFastCoDel::BlueStoreSlowFastCoDel(
+  CephContext *_cct,
+  std::function<void(int64_t)> _bluestore_budget_reset_callback,
+  std::function<int64_t()> _get_kv_throttle_current) :
+  fast_timer(_cct, fast_timer_lock),
+  slow_timer(_cct, slow_timer_lock),
+  bluestore_budget_reset_callback(_bluestore_budget_reset_callback),
+  get_kv_throttle_current(_get_kv_throttle_current) {
+  on_config_changed(_cct);
+}
+
+BlueStoreSlowFastCoDel::~BlueStoreSlowFastCoDel() {
+  {
+    std::lock_guard l1{fast_timer_lock};
+    fast_timer.cancel_all_events();
+    fast_timer.shutdown();
+  }
+
+  {
+    std::lock_guard l2{slow_timer_lock};
+    slow_timer.cancel_all_events();
+    slow_timer.shutdown();
+  }
+
+  regression_throughput_history.clear();
+  regression_target_latency_history.clear();
+}
+
+void BlueStoreSlowFastCoDel::update_from_txc_info(
+  ceph::mono_clock::time_point txc_start_time,
+  uint64_t txc_bytes) {
+  std::lock_guard l(register_lock);
+  ceph::mono_clock::time_point now = ceph::mono_clock::now();
+  int64_t latency = std::chrono::nanoseconds(now - txc_start_time).count();
+
+  if (activated && max_queue_length < get_kv_throttle_current()) {
+    max_queue_length = get_kv_throttle_current();
+  }
+  if (min_latency == INITIAL_LATENCY_VALUE || latency < min_latency) {
+    min_latency = latency;
+  }
+  slow_interval_txc_cnt++;
+  slow_interval_registered_bytes += txc_bytes;
+}
+
+void BlueStoreSlowFastCoDel::on_min_latency_violation() {
+  if (target_latency > 0) {
+    double diff = (double) (target_latency - min_latency);
+    auto error_ratio = std::abs(diff) / min_latency;
+    if (error_ratio > 0.5) {
+      error_ratio = 0.5;
+    }
+    bluestore_budget = std::max(bluestore_budget * (1 - error_ratio),
+                                min_bluestore_budget * 1.0);
+  }
+}
+
+void BlueStoreSlowFastCoDel::on_no_violation() {
+  if (bluestore_budget < max_queue_length * 1.5) {
+    bluestore_budget = bluestore_budget + bluestore_budget_increment;
+  }
+}
+
+void BlueStoreSlowFastCoDel::on_config_changed(CephContext *cct) {
+  {
+    std::lock_guard l(register_lock);
+
+    activated = cct->_conf->bluestore_codel;
+    target_slope = cct->_conf->bluestore_codel_throughput_latency_tradeoff;
+    slow_interval = ((int64_t) cct->_conf->bluestore_codel_slow_interval) *
+            1000 * 1000;
+    initial_fast_interval = ((int64_t)
+            cct->_conf->bluestore_codel_fast_interval) * 1000 * 1000;
+    initial_target_latency = ((int64_t)
+            cct->_conf->bluestore_codel_initial_target_latency) * 1000 * 1000;
+    min_target_latency = ((int64_t)
+            cct->_conf->bluestore_codel_min_target_latency) * 1000 * 1000;
+    max_target_latency = ((int64_t)
+            cct->_conf->bluestore_codel_max_target_latency) * 1000 * 1000;
+    initial_bluestore_budget = cct->_conf->bluestore_codel_initial_budget_bytes;
+    min_bluestore_budget = cct->_conf->bluestore_codel_min_budget_bytes;
+    bluestore_budget_increment =
+            cct->_conf->bluestore_codel_budget_increment_bytes;
+    regression_history_size =
+            cct->_conf->bluestore_codel_regression_history_size;
+
+    bluestore_budget = initial_bluestore_budget;
+    min_bluestore_budget = initial_bluestore_budget;
+    max_queue_length = min_bluestore_budget;
+    fast_interval = initial_fast_interval;
+    target_latency = initial_target_latency;
+    min_latency = INITIAL_LATENCY_VALUE;
+    slow_interval_registered_bytes = 0;
+    regression_throughput_history.clear();
+    regression_target_latency_history.clear();
+    slow_interval_start = ceph::mono_clock::zero();
+  }
+
+  {
+    std::lock_guard l1{fast_timer_lock};
+    fast_timer.cancel_all_events();
+    fast_timer.init();
+  }
+  _fast_interval_process();
+  {
+    std::lock_guard l2{slow_timer_lock};
+    slow_timer.cancel_all_events();
+    slow_timer.init();
+  }
+  _slow_interval_process();
+}
+
+void BlueStoreSlowFastCoDel::reset_bluestore_budget() {
+  if (activated) {
+    bluestore_budget = std::max(min_bluestore_budget, bluestore_budget);
+    bluestore_budget_reset_callback(bluestore_budget);
+  }
+}
+
+void BlueStoreSlowFastCoDel::_fast_interval_process() {
+  std::lock_guard l(register_lock);
+  if (target_latency != INITIAL_LATENCY_VALUE &&
+      min_latency != INITIAL_LATENCY_VALUE) {
+    if (activated) {
+      if (_check_latency_violation()) {
+        // min latency violation
+        violation_count++;
+        _update_interval();
+        on_min_latency_violation(); // handle the violation
+      } else {
+        // no latency violation
+        violation_count = 0;
+        fast_interval = initial_fast_interval;
+        on_no_violation();
+      }
+      bluestore_budget = std::max(min_bluestore_budget, bluestore_budget);
+      bluestore_budget_reset_callback(bluestore_budget);
+    }
+
+    // reset interval
+    min_latency = INITIAL_LATENCY_VALUE;
+
+    on_fast_interval_finished();
+  }
+
+  auto codel_ctx = new LambdaContext(
+    [this](int r) {
+      _fast_interval_process();
+    });
+  auto interval_duration = std::chrono::nanoseconds(fast_interval);
+  fast_timer.add_event_after(interval_duration, codel_ctx);
+}
+
+void BlueStoreSlowFastCoDel::_slow_interval_process() {
+  std::lock_guard l(register_lock);
+  ceph::mono_clock::time_point now = ceph::mono_clock::now();
+  if (activated && !ceph::mono_clock::is_zero(slow_interval_start)
+      && slow_interval_txc_cnt > 0) {
+    double time_sec = nanosec_to_sec(
+      std::chrono::nanoseconds(now - slow_interval_start).count());
+
+    double slow_interval_throughput =
+      (slow_interval_registered_bytes * 1.0) / time_sec;
+    slow_interval_throughput = slow_interval_throughput / (1024.0 * 1024.0);
+    regression_target_latency_history.push_back(
+      nanosec_to_millisec(target_latency));
+    regression_throughput_history.push_back(slow_interval_throughput);
+    if (regression_target_latency_history.size() > regression_history_size) {
+      regression_target_latency_history.erase(
+        regression_target_latency_history.begin());
+      regression_throughput_history.erase(
+        regression_throughput_history.begin());
+    }
+    std::vector<double> targets;
+    std::vector<double> throughputs;
+    double target_ms = nanosec_to_millisec(initial_target_latency);
+    // If there is sufficient number of points, use the regression to find the
+    //  target_ms. Otherwise, target_ms will be initial_target_latency
+    if (regression_target_latency_history.size() >= regression_history_size) {
+      target_ms = ceph::find_slope_on_curve(
+        regression_target_latency_history,
+        regression_throughput_history,
+        target_slope);
+    }
+
+    target_latency_without_noise = millisec_to_nanosec(target_ms);
+    target_latency_without_noise = std::max(target_latency_without_noise,
+                                            min_target_latency);
+    target_latency_without_noise = std::min(target_latency_without_noise,
+                                            max_target_latency);
+    target_ms = nanosec_to_millisec(target_latency_without_noise);
+
+    // add log_normal noise
+    unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+    std::default_random_engine generator(seed);
+    double dist_params[2];
+    double rnd_std_dev = 5;
+    ceph::find_log_normal_dist_params(
+      target_ms,
+      nanosec_to_millisec(min_target_latency),
+      target_ms * rnd_std_dev,
+      dist_params);
+    std::lognormal_distribution<double> distribution(dist_params[0],
+                                                     dist_params[1]);
+
+    target_latency = millisec_to_nanosec(distribution(generator));
+    target_latency += min_target_latency;
+
+    if (target_latency < millisec_to_nanosec(target_ms)) {
+      std::uniform_real_distribution<> distr(0, 0.5);
+      target_latency = target_latency +
+                       (target_latency - millisec_to_nanosec(target_ms)) *
+                       distr(generator);
+    }
+
+    if (target_latency != INITIAL_LATENCY_VALUE) {
+      target_latency = std::max(target_latency, min_target_latency);
+      target_latency = std::min(target_latency, max_target_latency);
+    }
+
+    on_slow_interval_finished();
+  }
+
+  slow_interval_start = ceph::mono_clock::now();
+  slow_interval_registered_bytes = 0;
+  slow_interval_txc_cnt = 0;
+  max_queue_length = min_bluestore_budget;
+
+  auto codel_ctx = new LambdaContext(
+    [this](int r) {
+      _slow_interval_process();
+    });
+  auto interval_duration = std::chrono::nanoseconds(slow_interval);
+  slow_timer.add_event_after(interval_duration, codel_ctx);
+}
+
+
+/**
+* check if the min latency violate the target
+* @return true if min latency violate the target, false otherwise
+*/
+bool BlueStoreSlowFastCoDel::_check_latency_violation() {
+  if (target_latency != INITIAL_LATENCY_VALUE &&
+      min_latency != INITIAL_LATENCY_VALUE) {
+    if (min_latency > target_latency) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void BlueStoreSlowFastCoDel::_update_interval() {
+  auto sqrt = (int) std::round(std::sqrt(violation_count));
+  fast_interval = initial_fast_interval / sqrt;
+  if (fast_interval <= 0) {
+    fast_interval = 1000;
+  }
+}
+
+int64_t BlueStoreSlowFastCoDel::get_bluestore_budget() {
+  return bluestore_budget;
+}
+
+int64_t BlueStoreSlowFastCoDel::get_target_latency() {
+  return target_latency;
+}
diff --git a/src/os/bluestore/BlueStoreSlowFastCoDel.h b/src/os/bluestore/BlueStoreSlowFastCoDel.h
new file mode 100644 (file)
index 0000000..242260f
--- /dev/null
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#include <iostream>
+
+#include "include/Context.h"
+#include "common/Timer.h"
+#include "common/ceph_time.h"
+
+class BlueStoreSlowFastCoDel {
+public:
+  BlueStoreSlowFastCoDel(
+    CephContext *_cct,
+    std::function<void(int64_t)> _bluestore_budget_reset_callback,
+    std::function<int64_t()> _get_kv_throttle_current);
+
+  virtual ~BlueStoreSlowFastCoDel();
+
+  void on_config_changed(CephContext *cct);
+
+  void reset_bluestore_budget();
+
+  void update_from_txc_info(
+    ceph::mono_clock::time_point txc_start_time,
+    uint64_t txc_bytes);
+
+  int64_t get_bluestore_budget();
+
+  int64_t get_target_latency();
+
+  bool is_activated();
+
+protected:
+  static const int64_t INITIAL_LATENCY_VALUE = -1;
+
+  /* config values */
+  // Config value 'bluestore_codel',true if SlowFastCodel is activated
+  bool activated = false;
+  // Config value 'bluestore_codel_fast_interval', Initial interval for fast loop
+  int64_t initial_fast_interval = INITIAL_LATENCY_VALUE;
+  // Config value 'bluestore_codel_initial_target_latency', Initial target latency
+  // to start the algorithm
+  int64_t initial_target_latency = INITIAL_LATENCY_VALUE;
+  // Config value 'bluestore_codel_slow_interval', the interval for the slow loop
+  int64_t slow_interval = INITIAL_LATENCY_VALUE;
+  // Config value 'bluestore_codel_min_target_latency', min possible value for target
+  int64_t min_target_latency = INITIAL_LATENCY_VALUE;  // in ns
+  // Config value 'bluestore_codel_max_target_latency', max possible value for target
+  int64_t max_target_latency = INITIAL_LATENCY_VALUE; // in ns
+  // Config value 'bluestore_codel_throughput_latency_tradeoff', define the
+  // tradeoff between throughput and latency (MB/s loss for every 1ms latency drop)
+  double target_slope = 5;
+  // Config value 'bluestore_codel_regression_history_size', regression history size
+  int64_t regression_history_size = 100;
+  // Config value 'bluestore_codel_min_budget_bytes', the minimum bluestore
+  // throttle budget
+  int64_t min_bluestore_budget = 102400;
+  // Config value 'bluestore_codel_initial_budget_bytes', the initial bluestore
+  // throttle budget
+  int64_t initial_bluestore_budget = 102400;
+  // Config value 'bluestore_codel_budget_increment_bytes', the increment size
+  // for opening the bluestore throttle
+  int64_t bluestore_budget_increment = 102400;
+
+  /* internal state variables */
+  // current interval for the fast loop
+  int64_t fast_interval = INITIAL_LATENCY_VALUE;
+  // current target latency that fast loop is using
+  int64_t target_latency = INITIAL_LATENCY_VALUE;
+  int64_t target_latency_without_noise = INITIAL_LATENCY_VALUE;
+  // min latency in the current fast interval
+  int64_t min_latency = INITIAL_LATENCY_VALUE;
+  int64_t violation_count = 0;
+  ceph::mutex fast_timer_lock = ceph::make_mutex("CoDel::fast_timer_lock");
+  ceph::mutex slow_timer_lock = ceph::make_mutex("CoDel::slow_timer_lock");
+  ceph::mutex register_lock = ceph::make_mutex("CoDel::register_lock");
+  SafeTimer fast_timer;  // fast loop timer
+  SafeTimer slow_timer;  // slow loop timer
+  // marks the start of the current slow interval
+  ceph::mono_clock::time_point slow_interval_start = ceph::mono_clock::zero();
+  // amount of bytes that has been processed in current slow interval
+  int64_t slow_interval_registered_bytes = 0;
+  // number of transactions that has been processed in current slow interval
+  int64_t slow_interval_txc_cnt = 0;
+  // target latency history for regression
+  std::vector<double> regression_target_latency_history;
+  // throughput history for regression
+  std::vector<double> regression_throughput_history;
+  int64_t bluestore_budget = 102400;  // current bluestore throttle budget
+  // maximum amount of inflight data in current slow interval
+  int64_t max_queue_length = 102400;
+  std::function<void(int64_t)> bluestore_budget_reset_callback;
+  std::function<int64_t(void)> get_kv_throttle_current;
+
+  void on_min_latency_violation();
+
+  void on_no_violation();
+
+  virtual void on_fast_interval_finished() {}
+
+  virtual void on_slow_interval_finished() {}
+
+private:
+
+  bool _check_latency_violation();
+
+  void _update_interval();
+
+  void _fast_interval_process();
+
+  void _slow_interval_process();
+
+  template<typename T>
+  double millisec_to_nanosec(T ms) {
+    return ms * 1000.0 * 1000.0;
+  }
+
+  template<typename T>
+  double nanosec_to_millisec(T ns) {
+    return ns / (1000.0 * 1000.0);
+  }
+
+  template<typename T>
+  double nanosec_to_sec(T ns) {
+    return ns / (1000.0 * 1000.0 * 1000.0);
+  }
+};