osd: Run osd bench test to override default max osd capacity for mclock

author Sridhar Seshasayee <sseshasa@redhat.com>

Mon, 10 May 2021 09:11:54 +0000 (14:41 +0530)

committer Sridhar Seshasayee <sseshasa@redhat.com>

Tue, 3 Aug 2021 05:54:36 +0000 (11:24 +0530)
author Sridhar Seshasayee <sseshasa@redhat.com>
Mon, 10 May 2021 09:11:54 +0000 (14:41 +0530)
committer Sridhar Seshasayee <sseshasa@redhat.com>
Tue, 3 Aug 2021 05:54:36 +0000 (11:24 +0530)
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc

index f580fb6f5645280c7357ade9585e2e64677f1340..525304726b2d4a70d41514006df6d2d2c60b2c21 100644 (file)
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -2323,9 +2323,6 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
        this);
      shards.push_back(one_shard);
    }
-
-  // override some config options if mclock is enabled on all the shards
-  maybe_override_options_for_qos();
  }
  
  OSD::~OSD()
@@ -2833,137 +2830,13 @@ will start to track new ops received afterwards.";
      cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
      cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
      cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
+    double elapsed = 0.0;
  
-    uint32_t duration = cct->_conf->osd_bench_duration;
-
-    if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
-      // let us limit the block size because the next checks rely on it
-      // having a sane value.  If we allow any block size to be set things
-      // can still go sideways.
-      ss << "block 'size' values are capped at "
-         << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
-         << " a higher value, please adjust 'osd_bench_max_block_size'";
-      ret = -EINVAL;
+    ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
+    if (ret != 0) {
        goto out;
-    } else if (bsize < (int64_t) (1 << 20)) {
-      // entering the realm of small block sizes.
-      // limit the count to a sane value, assuming a configurable amount of
-      // IOPS and duration, so that the OSD doesn't get hung up on this,
-      // preventing timeouts from going off
-      int64_t max_count =
-        bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
-      if (count > max_count) {
-        ss << "'count' values greater than " << max_count
-           << " for a block size of " << byte_u_t(bsize) << ", assuming "
-           << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
-           << " for " << duration << " seconds,"
-           << " can cause ill effects on osd. "
-           << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
-           << " value if you wish to use a higher 'count'.";
-        ret = -EINVAL;
-        goto out;
-      }
-    } else {
-      // 1MB block sizes are big enough so that we get more stuff done.
-      // However, to avoid the osd from getting hung on this and having
-      // timers being triggered, we are going to limit the count assuming
-      // a configurable throughput and duration.
-      // NOTE: max_count is the total amount of bytes that we believe we
-      //       will be able to write during 'duration' for the given
-      //       throughput.  The block size hardly impacts this unless it's
-      //       way too big.  Given we already check how big the block size
-      //       is, it's safe to assume everything will check out.
-      int64_t max_count =
-        cct->_conf->osd_bench_large_size_max_throughput * duration;
-      if (count > max_count) {
-        ss << "'count' values greater than " << max_count
-           << " for a block size of " << byte_u_t(bsize) << ", assuming "
-           << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
-           << " for " << duration << " seconds,"
-           << " can cause ill effects on osd. "
-           << " Please adjust 'osd_bench_large_size_max_throughput'"
-           << " with a higher value if you wish to use a higher 'count'.";
-        ret = -EINVAL;
-        goto out;
-      }
-    }
-
-    if (osize && bsize > osize)
-      bsize = osize;
-
-    dout(1) << " bench count " << count
-            << " bsize " << byte_u_t(bsize) << dendl;
-
-    ObjectStore::Transaction cleanupt;
-
-    if (osize && onum) {
-      bufferlist bl;
-      bufferptr bp(osize);
-      bp.zero();
-      bl.push_back(std::move(bp));
-      bl.rebuild_page_aligned();
-      for (int i=0; i<onum; ++i) {
-       char nm[30];
-       snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
-       object_t oid(nm);
-       hobject_t soid(sobject_t(oid, 0));
-       ObjectStore::Transaction t;
-       t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
-       store->queue_transaction(service.meta_ch, std::move(t), NULL);
-       cleanupt.remove(coll_t(), ghobject_t(soid));
-      }
-    }
-
-    bufferlist bl;
-    bufferptr bp(bsize);
-    bp.zero();
-    bl.push_back(std::move(bp));
-    bl.rebuild_page_aligned();
-
-    {
-      C_SaferCond waiter;
-      if (!service.meta_ch->flush_commit(&waiter)) {
-       waiter.wait();
-      }
      }
  
-    utime_t start = ceph_clock_now();
-    for (int64_t pos = 0; pos < count; pos += bsize) {
-      char nm[30];
-      unsigned offset = 0;
-      if (onum && osize) {
-       snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
-       offset = rand() % (osize / bsize) * bsize;
-      } else {
-       snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
-      }
-      object_t oid(nm);
-      hobject_t soid(sobject_t(oid, 0));
-      ObjectStore::Transaction t;
-      t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
-      store->queue_transaction(service.meta_ch, std::move(t), NULL);
-      if (!onum || !osize)
-       cleanupt.remove(coll_t::meta(), ghobject_t(soid));
-    }
-
-    {
-      C_SaferCond waiter;
-      if (!service.meta_ch->flush_commit(&waiter)) {
-       waiter.wait();
-      }
-    }
-    utime_t end = ceph_clock_now();
-
-    // clean up
-    store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
-    {
-      C_SaferCond waiter;
-      if (!service.meta_ch->flush_commit(&waiter)) {
-       waiter.wait();
-      }
-    }
-
-    double elapsed = end - start;
      double rate = count / elapsed;
      double iops = rate / bsize;
      f->open_object_section("osd_bench_results");
@@ -3218,6 +3091,150 @@ will start to track new ops received afterwards.";
    on_finish(ret, ss.str(), outbl);
  }
  
+int OSD::run_osd_bench_test(
+  int64_t count,
+  int64_t bsize,
+  int64_t osize,
+  int64_t onum,
+  double *elapsed,
+  ostream &ss)
+{
+  int ret = 0;
+  uint32_t duration = cct->_conf->osd_bench_duration;
+
+  if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
+    // let us limit the block size because the next checks rely on it
+    // having a sane value.  If we allow any block size to be set things
+    // can still go sideways.
+    ss << "block 'size' values are capped at "
+       << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
+       << " a higher value, please adjust 'osd_bench_max_block_size'";
+    ret = -EINVAL;
+    return ret;
+  } else if (bsize < (int64_t) (1 << 20)) {
+    // entering the realm of small block sizes.
+    // limit the count to a sane value, assuming a configurable amount of
+    // IOPS and duration, so that the OSD doesn't get hung up on this,
+    // preventing timeouts from going off
+    int64_t max_count =
+      bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
+    if (count > max_count) {
+      ss << "'count' values greater than " << max_count
+         << " for a block size of " << byte_u_t(bsize) << ", assuming "
+         << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
+         << " for " << duration << " seconds,"
+         << " can cause ill effects on osd. "
+         << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
+         << " value if you wish to use a higher 'count'.";
+      ret = -EINVAL;
+      return ret;
+    }
+  } else {
+    // 1MB block sizes are big enough so that we get more stuff done.
+    // However, to avoid the osd from getting hung on this and having
+    // timers being triggered, we are going to limit the count assuming
+    // a configurable throughput and duration.
+    // NOTE: max_count is the total amount of bytes that we believe we
+    //       will be able to write during 'duration' for the given
+    //       throughput.  The block size hardly impacts this unless it's
+    //       way too big.  Given we already check how big the block size
+    //       is, it's safe to assume everything will check out.
+    int64_t max_count =
+      cct->_conf->osd_bench_large_size_max_throughput * duration;
+    if (count > max_count) {
+      ss << "'count' values greater than " << max_count
+         << " for a block size of " << byte_u_t(bsize) << ", assuming "
+         << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
+         << " for " << duration << " seconds,"
+         << " can cause ill effects on osd. "
+         << " Please adjust 'osd_bench_large_size_max_throughput'"
+         << " with a higher value if you wish to use a higher 'count'.";
+      ret = -EINVAL;
+      return ret;
+    }
+  }
+
+  if (osize && bsize > osize) {
+    bsize = osize;
+  }
+
+  dout(1) << " bench count " << count
+          << " bsize " << byte_u_t(bsize) << dendl;
+
+  ObjectStore::Transaction cleanupt;
+
+  if (osize && onum) {
+    bufferlist bl;
+    bufferptr bp(osize);
+    bp.zero();
+    bl.push_back(std::move(bp));
+    bl.rebuild_page_aligned();
+    for (int i=0; i<onum; ++i) {
+      char nm[30];
+      snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
+      object_t oid(nm);
+      hobject_t soid(sobject_t(oid, 0));
+      ObjectStore::Transaction t;
+      t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
+      store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+      cleanupt.remove(coll_t(), ghobject_t(soid));
+    }
+  }
+
+  bufferlist bl;
+  bufferptr bp(bsize);
+  bp.zero();
+  bl.push_back(std::move(bp));
+  bl.rebuild_page_aligned();
+
+  {
+    C_SaferCond waiter;
+    if (!service.meta_ch->flush_commit(&waiter)) {
+      waiter.wait();
+    }
+  }
+
+  utime_t start = ceph_clock_now();
+  for (int64_t pos = 0; pos < count; pos += bsize) {
+    char nm[30];
+    unsigned offset = 0;
+    if (onum && osize) {
+      snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
+      offset = rand() % (osize / bsize) * bsize;
+    } else {
+      snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
+    }
+    object_t oid(nm);
+    hobject_t soid(sobject_t(oid, 0));
+    ObjectStore::Transaction t;
+    t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
+    store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+    if (!onum || !osize) {
+      cleanupt.remove(coll_t::meta(), ghobject_t(soid));
+    }
+  }
+
+  {
+    C_SaferCond waiter;
+    if (!service.meta_ch->flush_commit(&waiter)) {
+      waiter.wait();
+    }
+  }
+  utime_t end = ceph_clock_now();
+  *elapsed = end - start;
+
+  // clean up
+  store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr);
+  {
+    C_SaferCond waiter;
+    if (!service.meta_ch->flush_commit(&waiter)) {
+      waiter.wait();
+    }
+  }
+
+ return ret;
+}
+
  class TestOpsSocketHook : public AdminSocketHook {
    OSDService *service;
    ObjectStore *store;
@@ -3767,6 +3784,10 @@ int OSD::init()
  
    start_boot();
  
+  // Override a few options if mclock scheduler is enabled.
+  maybe_override_max_osd_capacity_for_qos();
+  maybe_override_options_for_qos();
+
    return 0;
  
  out:
@@ -10084,6 +10105,53 @@ void OSD::handle_conf_change(const ConfigProxy& conf,
    }
  }
  
+void OSD::maybe_override_max_osd_capacity_for_qos()
+{
+  // If the scheduler enabled is mclock, override the default
+  // osd capacity with the value obtained from running the
+  // osd bench test. This is later used to setup mclock.
+  if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") {
+    // Write 200 4MiB objects with blocksize 4KiB
+    int64_t count = 12288000; // Count of bytes to write
+    int64_t bsize = 4096;     // Block size
+    int64_t osize = 4194304;  // Object size
+    int64_t onum = 100;       // Count of objects to write
+    double elapsed = 0.0;     // Time taken to complete the test
+    stringstream ss;
+    int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
+    if (ret != 0) {
+      derr << __func__
+           << " osd bench err: " << ret
+           << " osd bench errstr: " << ss.str()
+           << dendl;
+    } else {
+      double rate = count / elapsed;
+      double iops = rate / bsize;
+      dout(1) << __func__
+              << " osd bench result -"
+              << std::fixed << std::setprecision(3)
+              << " bandwidth (MiB/sec): " << rate / (1024 * 1024)
+              << " iops: " << iops
+              << " elapsed_sec: " << elapsed
+              << dendl;
+
+      // Override the appropriate config option
+      if (store_is_rotational) {
+        cct->_conf.set_val(
+          "osd_mclock_max_capacity_iops_hdd", std::to_string(iops));
+      } else {
+        cct->_conf.set_val(
+          "osd_mclock_max_capacity_iops_ssd", std::to_string(iops));
+      }
+
+      // Override the max osd capacity for all shards
+      for (auto& shard : shards) {
+        shard->update_scheduler_config();
+      }
+    }
+  }
+}
+
  bool OSD::maybe_override_options_for_qos()
  {
    // If the scheduler enabled is mclock, override the recovery, backfill
@@ -10632,6 +10700,12 @@ void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
    }
  }
  
+void OSDShard::update_scheduler_config()
+{
+  std::lock_guard l(shard_lock);
+  scheduler->update_configuration();
+}
+
  OSDShard::OSDShard(
    int id,
    CephContext *cct,
diff --git a/src/osd/OSD.h b/src/osd/OSD.h

index ed1fbcff1d4b9bcf41f3683a76e325b26962731b..865f7a1ca4c01980db33a76aaa3d69f5eb4fab8c 100644 (file)
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1079,6 +1079,7 @@ struct OSDShard {
                     std::set<std::pair<spg_t,epoch_t>> *merge_pgs);
    void register_and_wake_split_child(PG *pg);
    void unprime_split_children(spg_t parent, unsigned old_pg_num);
+  void update_scheduler_config();
  
    OSDShard(
      int id,
@@ -2062,7 +2063,14 @@ private:
    float get_osd_snap_trim_sleep();
  
    int get_recovery_max_active();
+  void maybe_override_max_osd_capacity_for_qos();
    bool maybe_override_options_for_qos();
+  int run_osd_bench_test(int64_t count,
+                         int64_t bsize,
+                         int64_t osize,
+                         int64_t onum,
+                         double *elapsed,
+                         std::ostream& ss);
  
    void scrub_purged_snaps();
    void probe_smart(const std::string& devid, std::ostream& ss);
diff --git a/src/osd/scheduler/OpScheduler.h b/src/osd/scheduler/OpScheduler.h

index 0c647c95114bd56df8b44651ff892f947040b1d1..6e2bb5abd829f1cec2e65f319ca62fce2f4620ab 100644 (file)
--- a/src/osd/scheduler/OpScheduler.h
+++ b/src/osd/scheduler/OpScheduler.h
@@ -50,6 +50,9 @@ public:
    // Print human readable brief description with relevant parameters
    virtual void print(std::ostream &out) const = 0;
  
+  // Apply config changes to the scheduler (if any)
+  virtual void update_configuration() = 0;
+
    // Destructor
    virtual ~OpScheduler() {};
  };
@@ -134,6 +137,10 @@ public:
      out << ", cutoff=" << cutoff << ")";
    }
  
+  void update_configuration() final {
+    // no-op
+  }
+
    ~ClassedOpQueueScheduler() final {};
  };
  
diff --git a/src/osd/scheduler/mClockScheduler.cc b/src/osd/scheduler/mClockScheduler.cc

index e34c4094e0ca13fd52edaecd8caf168b7dc6c1fd..f2f0ffc3dbd4635430f213ed1b12912fcfc2113e 100644 (file)
--- a/src/osd/scheduler/mClockScheduler.cc
+++ b/src/osd/scheduler/mClockScheduler.cc
@@ -109,7 +109,9 @@ void mClockScheduler::set_max_osd_capacity()
    // Set per op-shard iops limit
    max_osd_capacity /= num_shards;
    dout(1) << __func__ << " #op shards: " << num_shards
-          << " max osd capacity(iops) per shard: " << max_osd_capacity << dendl;
+          << std::fixed << std::setprecision(2)
+          << " max osd capacity(iops) per shard: " << max_osd_capacity
+          << dendl;
  }
  
  void mClockScheduler::set_osd_mclock_cost_per_io()
@@ -132,7 +134,8 @@ void mClockScheduler::set_osd_mclock_cost_per_io()
      }
    }
    dout(1) << __func__ << " osd_mclock_cost_per_io: "
-          << std::fixed << osd_mclock_cost_per_io << dendl;
+          << std::fixed << std::setprecision(7) << osd_mclock_cost_per_io
+          << dendl;
  }
  
  void mClockScheduler::set_osd_mclock_cost_per_byte()
@@ -155,7 +158,8 @@ void mClockScheduler::set_osd_mclock_cost_per_byte()
      }
    }
    dout(1) << __func__ << " osd_mclock_cost_per_byte: "
-          << std::fixed << osd_mclock_cost_per_byte << dendl;
+          << std::fixed << std::setprecision(7) << osd_mclock_cost_per_byte
+          << dendl;
  }
  
  void mClockScheduler::set_mclock_profile()
@@ -373,6 +377,14 @@ int mClockScheduler::calc_scaled_cost(int item_cost)
    return std::max(scaled_cost, 1);
  }
  
+void mClockScheduler::update_configuration()
+{
+  // Apply configuration change. The expectation is that
+  // at least one of the tracked mclock config option keys
+  // is modified before calling this method.
+  cct->_conf.apply_changes(nullptr);
+}
+
  void mClockScheduler::dump(ceph::Formatter &f) const
  {
  }
diff --git a/src/osd/scheduler/mClockScheduler.h b/src/osd/scheduler/mClockScheduler.h

index faee092f665d2bccf59dad4a9476e530c752d901..32f3851ec0558cf218c17e7b212811390b307070 100644 (file)
--- a/src/osd/scheduler/mClockScheduler.h
+++ b/src/osd/scheduler/mClockScheduler.h
@@ -193,6 +193,9 @@ public:
      ostream << "mClockScheduler";
    }
  
+  // Update data associated with the modified mclock config key(s)
+  void update_configuration() final;
+
    const char** get_tracked_conf_keys() const final;
    void handle_conf_change(const ConfigProxy& conf,
                           const std::set<std::string> &changed) final;
author	Sridhar Seshasayee <sseshasa@redhat.com>
	Mon, 10 May 2021 09:11:54 +0000 (14:41 +0530)
committer	Sridhar Seshasayee <sseshasa@redhat.com>
	Tue, 3 Aug 2021 05:54:36 +0000 (11:24 +0530)
src/osd/OSD.cc		patch \| blob \| history
src/osd/OSD.h		patch \| blob \| history
src/osd/scheduler/OpScheduler.h		patch \| blob \| history
src/osd/scheduler/mClockScheduler.cc		patch \| blob \| history
src/osd/scheduler/mClockScheduler.h		patch \| blob \| history