From 74925d762144d2b7ac8a5682db66cdbeb4fe2dae Mon Sep 17 00:00:00 2001
From: Sridhar Seshasayee <sseshasa@redhat.com>
Date: Mon, 23 Jun 2025 13:55:18 +0530
Subject: [PATCH] src/ceph_osd, osd: Implement running benchmark during OSD
 creation

The changeset implements the following:

1. Adds an option called "--run-benchmark" to the ceph-osd process to enable
   running the existing osd benchmark during the creation of the OSD but before
   it's added to the cluster. If the option is specified, the benchmark is run
   just after 'mkfs' is performed on the OSD.
2. A separate OSDBenchTest class is created that can be instantiated when the
   benchmark is needed to be run. The class has interfaces to perform precheck,
   prefill, run the actual randwrite test and do the cleanup. It also has
   interfaces to extract the metrics. The new class in used where applicable
   and maintains the existing behavior of the "bench" command.
3. The vstart.sh script is modified to run the benchmark and store the results
   for each OSD in ceph.conf. This currently prevents the execution of the
   existing benchmark during the OSD boot-up.

Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
Fixes: https://tracker.ceph.com/issues/71941
---
 src/ceph_osd.cc |  21 ++
 src/osd/OSD.cc  | 540 ++++++++++++++++++++++++++++++++++--------------
 src/osd/OSD.h   | 166 +++++++++++++++
 src/vstart.sh   |  27 +++
 4 files changed, 600 insertions(+), 154 deletions(-)
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index 52988843c83..34737f29360 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -113,6 +113,7 @@ static void usage()
        << "  --debug_osd <N>   set debug level (e.g. 10)\n"
        << "  --get-device-fsid PATH\n"
        << "                    get OSD fsid for the given block device\n"
+       << "  --run-benchmark   run a throughput benchmark test against the OSD and dump the result\n"
        << std::endl;
   generic_server_usage();
 }
@@ -151,6 +152,7 @@ int main(int argc, const char **argv)
   bool get_cluster_fsid = false;
   bool get_journal_fsid = false;
   bool get_device_fsid = false;
+  bool run_benchmark = false;
   string device_path;
   std::string dump_pg_log;
   std::string osdspec_affinity;
@@ -190,6 +192,8 @@ int main(int argc, const char **argv)
     } else if (ceph_argparse_witharg(args, i, &device_path,
 				     "--get-device-fsid", (char*)NULL)) {
       get_device_fsid = true;
+    } else if (ceph_argparse_flag(args, i, "--run-benchmark", (char*)NULL)) {
+      run_benchmark = true;
     } else {
       ++i;
     }
@@ -380,6 +384,23 @@ int main(int argc, const char **argv)
   if (mkkey) {
     forker.exit(0);
   }
+  // Run a benchmark if specified
+  if (run_benchmark) {
+    store->mount();
+    tl::expected<std::string, int> res =
+      OSD::run_osd_bench(g_ceph_context, store.get());
+    if (!res.has_value()) {
+      int ret = res.error();
+      derr << TEXT_RED << " ** ERROR: error running benchmark: "
+           << cpp_strerror(ret) << TEXT_NORMAL << dendl;
+      cerr << " ** ERROR: error running benchmark: "
+           << cpp_strerror(ret) << std::endl;
+      forker.exit(ret);
+    }
+    cout << res.value() << std::endl;
+    store->umount();
+    forker.exit(0);
+  }
   if (mkjournal) {
     common_init_finish(g_ceph_context);
     int err = store->mkjournal();
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index a676804d5db..c689d5f9741 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -2228,6 +2228,54 @@ int OSD::mkfs(CephContext *cct,
   return ret;
 }
 
+tl::expected<std::string, int>
+OSD::run_osd_bench(CephContext *cct,
+                   ObjectStore *store)
+{
+  // Bench test details:
+  //  1. Prefill 100 4 MiB objects with blocksize 4 KiB
+  //  2. Write to random offsets within a randomly selected object
+  //     prefilled above with block size 4 KiB.
+  //  3. Repeat step 2 until writing the count of Bytes.
+  constexpr int64_t count = 12288000; // Count of bytes to write
+  constexpr int64_t bsize = 4096;     // Block size
+  constexpr int64_t osize = 4194304;  // Object size
+  constexpr int64_t onum = 100;       // Count of objects to write
+
+  ObjectStore::CollectionHandle ch =
+    store->open_collection(coll_t::meta());
+
+  OSDBenchTest osd_bench{cct, store, ch, count, bsize, osize, onum};
+
+  int ret = osd_bench.run_test();
+  if (ret != 0) {
+    return tl::unexpected(ret);
+  }
+
+  // Format the result in json format
+  std::string result;
+  auto f = Formatter::create_unique("json");
+  if (f) {
+    bufferlist out;
+    f->open_object_section("osd_bench_results");
+    f->dump_int("status", ret);
+    f->dump_int("bytes_written", count);
+    f->dump_int("blocksize", bsize);
+    f->dump_float("prefill_time", osd_bench.get_prefill_time());
+    f->dump_float("elapsed_sec", osd_bench.get_elapsed_time());
+    f->dump_float("bytes_per_sec", osd_bench.get_bandwidth_rate());
+    f->dump_float("iops", osd_bench.get_iops_rate());
+    f->dump_int("is_rotational", store->is_rotational() ? 1 : 0);
+    f->close_section();
+    f->flush(out);
+    result = std::string(out.c_str(), out.length());
+  } else {
+    return tl::unexpected(-1);
+  }
+
+  return result;
+}
+
 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
 {
   char val[80];
@@ -3429,149 +3477,17 @@ int OSD::run_osd_bench_test(
   ostream &ss)
 {
   int ret = 0;
-  srand(time(NULL) % (unsigned long) -1);
-  uint32_t duration = cct->_conf->osd_bench_duration;
 
-  if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
-    // let us limit the block size because the next checks rely on it
-    // having a sane value.  If we allow any block size to be set things
-    // can still go sideways.
-    ss << "block 'size' values are capped at "
-       << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
-       << " a higher value, please adjust 'osd_bench_max_block_size'";
-    ret = -EINVAL;
+  OSDBenchTest osd_bench{cct, store.get(), service.meta_ch,
+                         count, bsize, osize, onum};
+  ret = osd_bench.run_test();
+  if (ret != 0) {
+    ss << osd_bench.get_errstr();
     return ret;
-  } else if (bsize < (int64_t) (1 << 20)) {
-    // entering the realm of small block sizes.
-    // limit the count to a sane value, assuming a configurable amount of
-    // IOPS and duration, so that the OSD doesn't get hung up on this,
-    // preventing timeouts from going off
-    int64_t max_count =
-      bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
-    if (count > max_count) {
-      ss << "'count' values greater than " << max_count
-         << " for a block size of " << byte_u_t(bsize) << ", assuming "
-         << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
-         << " for " << duration << " seconds,"
-         << " can cause ill effects on osd. "
-         << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
-         << " value if you wish to use a higher 'count'.";
-      ret = -EINVAL;
-      return ret;
-    }
-  } else {
-    // 1MB block sizes are big enough so that we get more stuff done.
-    // However, to avoid the osd from getting hung on this and having
-    // timers being triggered, we are going to limit the count assuming
-    // a configurable throughput and duration.
-    // NOTE: max_count is the total amount of bytes that we believe we
-    //       will be able to write during 'duration' for the given
-    //       throughput.  The block size hardly impacts this unless it's
-    //       way too big.  Given we already check how big the block size
-    //       is, it's safe to assume everything will check out.
-    int64_t max_count =
-      cct->_conf->osd_bench_large_size_max_throughput * duration;
-    if (count > max_count) {
-      ss << "'count' values greater than " << max_count
-         << " for a block size of " << byte_u_t(bsize) << ", assuming "
-         << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
-         << " for " << duration << " seconds,"
-         << " can cause ill effects on osd. "
-         << " Please adjust 'osd_bench_large_size_max_throughput'"
-         << " with a higher value if you wish to use a higher 'count'.";
-      ret = -EINVAL;
-      return ret;
-    }
-  }
-
-  if (osize && bsize > osize) {
-    bsize = osize;
-  }
-
-  dout(0) << " bench count " << count
-          << " bsize " << byte_u_t(bsize)
-          << " onum " << onum
-          << " osize " << byte_u_t(osize)
-          << dendl;
-
-  ObjectStore::Transaction cleanupt;
-  utime_t start = ceph_clock_now();
-
-  if (osize && onum) {
-    bufferlist bl;
-    bufferptr bp(osize);
-    memset(bp.c_str(), 'a', bp.length());
-    bl.push_back(std::move(bp));
-    bl.rebuild_page_aligned();
-    for (int i=0; i<onum; ++i) {
-      char nm[30];
-      snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
-      object_t oid(nm);
-      hobject_t soid(sobject_t(oid, 0));
-      ObjectStore::Transaction t;
-      t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
-      store->queue_transaction(service.meta_ch, std::move(t), nullptr);
-      cleanupt.remove(coll_t(), ghobject_t(soid));
-    }
-  }
-
-  {
-    C_SaferCond waiter;
-    if (!service.meta_ch->flush_commit(&waiter)) {
-      waiter.wait();
-    }
-  }
-  dout(0) << __func__
-          << " prefill took " << ceph_clock_now() - start
-          << dendl;
-
-
-  start = ceph_clock_now();
-  bufferlist bl;
-  for (int64_t pos = 0; pos < count; pos += bsize) {
-    char nm[34];
-    unsigned offset = 0;
-    bufferptr bp(bsize);
-    memset(bp.c_str(), rand() & 0xff, bp.length());
-    bl.push_back(std::move(bp));
-    bl.rebuild_page_aligned();
-    if (onum && osize) {
-      snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
-      offset = rand() % (osize / bsize) * bsize;
-    } else {
-      snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
-    }
-    object_t oid(nm);
-    hobject_t soid(sobject_t(oid, 0));
-    ObjectStore::Transaction t;
-    t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
-    store->queue_transaction(service.meta_ch, std::move(t), nullptr);
-    if (!onum || !osize) {
-      cleanupt.remove(coll_t::meta(), ghobject_t(soid));
-    }
-    bl.clear();
-  }
-
-  {
-    C_SaferCond waiter;
-    if (!service.meta_ch->flush_commit(&waiter)) {
-      waiter.wait();
-    }
   }
-  utime_t end = ceph_clock_now();
-  *elapsed = end - start;
-  dout(0) << __func__
-          << " benchmark took " << *elapsed
-          << dendl;
 
-  // clean up
-  store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr);
-  {
-    C_SaferCond waiter;
-    if (!service.meta_ch->flush_commit(&waiter)) {
-      waiter.wait();
-    }
-  }
+  // get elapsed time
+  *elapsed = osd_bench.get_elapsed_time();
 
  return ret;
 }
@@ -10295,28 +10211,17 @@ void OSD::maybe_override_max_osd_capacity_for_qos()
     int64_t bsize = 4096;     // Block size
     int64_t osize = 4194304;  // Object size
     int64_t onum = 100;       // Count of objects to write
-    double elapsed = 0.0;     // Time taken to complete the test
-    double iops = 0.0;
-    stringstream ss;
-    int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
+    OSDBenchTest osd_bench{cct, store.get(), service.meta_ch,
+                           count, bsize, osize, onum};
+    int ret = osd_bench.run_test();
     if (ret != 0) {
       derr << __func__
            << " osd bench err: " << ret
-           << " osd bench errstr: " << ss.str()
+           << " osd bench errstr: " << osd_bench.get_errstr()
            << dendl;
       return;
     }
 
-    double rate = count / elapsed;
-    iops = rate / bsize;
-    dout(1) << __func__
-            << " osd bench result -"
-            << std::fixed << std::setprecision(3)
-            << " bandwidth (MiB/sec): " << rate / (1024 * 1024)
-            << " iops: " << iops
-            << " elapsed_sec: " << elapsed
-            << dendl;
-
     // Get the threshold IOPS set for the underlying hdd/ssd.
     double hi_threshold_iops = 0.0;
     double lo_threshold_iops = 0.0;
@@ -10335,6 +10240,7 @@ void OSD::maybe_override_max_osd_capacity_for_qos()
     // Persist the iops value to the MON store or throw cluster warning
     // if the measured iops is not in the threshold range. If the iops is
     // not within the threshold range, the current/default value is retained.
+    double iops = osd_bench.get_iops_rate();
     if (iops < lo_threshold_iops || iops > hi_threshold_iops) {
       clog->warn() << "OSD bench result of " << std::to_string(iops)
                    << " IOPS is not within the threshold limit range of "
@@ -11534,6 +11440,332 @@ void OSD::ShardedOpWQ::stop_for_fast_shutdown()
   }
 }
 
+// =============================================================
+
+#undef dout_context
+#define dout_context cct
+#undef dout_prefix
+#define dout_prefix *_dout << "OSDBenchTest: "
+
+/**
+ * Perform multiple pre-checks before initiating the test
+ *  - Validate the store and meta collection
+ *  - Validate input parameters and associated limits
+ *    (see comments below for more details)
+ */
+int OSDBenchTest::precheck()
+{
+  int ret = 0;
+
+  if (!store) {
+    derr << "OSDBenchTest: objectstore not specified!" << dendl;
+    errmsg << "ObjectStore not found!";
+    ret = -ENOENT;
+    return ret;
+  }
+
+  if (!ch) {
+    derr << "OSDBenchTest: meta collection not specified!" << dendl;
+    errmsg << "Meta collection on the ObjectStore not found!";
+    ret = -ENOENT;
+    return ret;
+  }
+
+  // The minimum requirement to run the test is:
+  //  - count of Bytes to write and
+  //  - block size
+  if (!count || !bsize) {
+    errmsg << "block size and/or count of Bytes to write not specified";
+    ret = -EINVAL;
+    return ret;
+  }
+
+  uint32_t duration = cct->_conf->osd_bench_duration;
+
+  if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
+    // let us limit the block size because the next checks rely on it
+    // having a sane value.  If we allow any block size to be set things
+    // can still go sideways.
+    errmsg << "block 'size' values are capped at "
+           << byte_u_t(cct->_conf->osd_bench_max_block_size)
+           << ". If you wish to use a higher value, please adjust"
+           << " 'osd_bench_max_block_size'";
+    ret = -EINVAL;
+    return ret;
+  } else if (bsize < (int64_t) (1 << 20)) {
+    // entering the realm of small block sizes.
+    // limit the count to a sane value, assuming a configurable amount of
+    // IOPS and duration, so that the OSD doesn't get hung up on this,
+    // preventing timeouts from going off
+    int64_t max_count =
+      bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
+    if (count > max_count) {
+      errmsg << "'count' values greater than " << max_count
+             << " for a block size of " << byte_u_t(bsize) << ", assuming "
+             << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
+             << " for " << duration << " seconds,"
+             << " can cause ill effects on osd. "
+             << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
+             << " value if you wish to use a higher 'count'.";
+      ret = -EINVAL;
+      return ret;
+    }
+  } else {
+    // 1MB block sizes are big enough so that we get more stuff done.
+    // However, to avoid the osd from getting hung on this and having
+    // timers being triggered, we are going to limit the count assuming
+    // a configurable throughput and duration.
+    // NOTE: max_count is the total amount of bytes that we believe we
+    //       will be able to write during 'duration' for the given
+    //       throughput.  The block size hardly impacts this unless it's
+    //       way too big.  Given we already check how big the block size
+    //       is, it's safe to assume everything will check out.
+    int64_t max_count =
+      cct->_conf->osd_bench_large_size_max_throughput * duration;
+    if (count > max_count) {
+      errmsg << "'count' values greater than " << max_count
+             << " for a block size of " << byte_u_t(bsize) << ", assuming "
+             << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput)
+             << "/s," << " for " << duration << " seconds,"
+             << " can cause ill effects on osd. "
+             << " Please adjust 'osd_bench_large_size_max_throughput'"
+             << " with a higher value if you wish to use a higher 'count'.";
+      ret = -EINVAL;
+      return ret;
+    }
+  }
+
+  if (osize && bsize > osize) {
+    dout(0) << fmt::format(
+                   "{}: bsize: {} is greater than osize: {}. Running test by"
+                   " overriding bsize to {}.", __func__, byte_u_t(bsize),
+                   byte_u_t(osize), byte_u_t(osize))
+            << dendl;
+    bsize = osize;
+  }
+
+  return ret;
+}
+
+/**
+ * Run a bench test.
+ *
+ * Run a bench test based on the set parameters. The test performs
+ *  - Prechecks to ensure the minimum requirements are satisified
+ *  - Flushes the objectstore cache
+ *  - Prefill the objectstore if object size('osize') and number of
+ *    objects('onum') are specified
+ *  - Perform the writes ('count' of bytes) to the test objects in
+ *    'bsize' chunks based on the passed parameters
+ *  - Clean-up: All the objects written for the test are cleaned up.
+ *  - Calculate and set the 'bandwidth' and 'iops'.
+ */
+int OSDBenchTest::run_test()
+{
+  int ret = precheck();
+  if (ret != 0) {
+    return ret;
+  }
+
+  dout(0) << fmt::format(
+                 "{}: running osd bench with "
+                 "count: {} Bytes bsize: {} onum: {} osize: {}",
+                 __func__, count, byte_u_t(bsize), onum, byte_u_t(osize))
+          << dendl;
+
+  // flush store cache
+  ret = flush_store_cache();
+  if (ret != 0) {
+    errmsg << "Error flushing objectstore cache: " << cpp_strerror(ret);
+    return ret;
+  }
+
+  // Prefill
+  prefill_objects();
+
+  // write test
+  perform_write_test();
+
+  // cleanup
+  cleanup();
+
+  // Calculate bandwidth & iops
+  if (elapsed && bsize) {
+    bandwidth = count / elapsed;
+    iops = bandwidth / bsize;
+    dout(0) << fmt::format(
+                   "{}: osd bench result - "
+                   "bandwidth: {}/s iops: {:.2f} elapsed_sec: {:.2f}",
+                    __func__, byte_u_t(bandwidth), iops, elapsed)
+            << dendl;
+  } else {
+    ret = -EIO;
+    errmsg << "Unable to determine bench result."
+           << " elapsed time: " << elapsed
+           << " bsize: " << bsize;
+  }
+
+  return ret;
+}
+
+/**
+ * Flush and commit writes to the ObjectStore
+ */
+void OSDBenchTest::wait_for_flush_commit()
+{
+  C_SaferCond waiter;
+  if (!ch->flush_commit(&waiter)) {
+    waiter.wait();
+  }
+}
+
+/**
+ * Prefill objects for the bench test
+ *
+ * The prefill phase is contingent on 'osize' and 'onum' as
+ * described below:
+ *
+ * case 1:
+ * If both object size ('osize') and number of objects ('onum')
+ * are specified, then the objectstore is prefilled. Prefilling
+ * is done to allow performing writes to random offsets within
+ * an object.
+ *
+ * case 2:
+ * If both 'osize' and 'onum' are not specified(set to 0), the
+ * prefill step is skipped. The test later on instead creates
+ * new objects and writes to them from offset 0 (sequential).
+ */
+void OSDBenchTest::prefill_objects()
+{
+  if (osize && onum) {
+    utime_t start = ceph_clock_now();
+    bufferptr bp(osize);
+    memset(bp.c_str(), 'a', bp.length());
+    bufferlist bl = bufferlist::static_from_mem(bp.c_str(), osize);
+    bl.rebuild_page_aligned();
+    for (int i = 0; i < onum; ++i) {
+      std::string nm = fmt::format("disk_bw_test_{}", i);
+      object_t oid(nm);
+      hobject_t soid(sobject_t(oid, 0));
+      ObjectStore::Transaction t;
+      t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
+      store->queue_transaction(ch, std::move(t), nullptr);
+      cleanupt.remove(coll_t(), ghobject_t(soid));
+    }
+
+    wait_for_flush_commit();
+
+    prefill_time = ceph_clock_now() - start;
+    dout(0) << fmt::format(
+                   "{}: Prefill took {:.2f} secs.",
+                   __func__, prefill_time)
+            << dendl;
+  } else {
+    dout(0) << fmt::format("{}: Prefill skipped.", __func__) << dendl;
+  }
+}
+
+/**
+ * Perform bench write test
+ *
+ * There are some key differences in the way writes are performed
+ * based on the specification of 'osize' and 'onum'.
+ *
+ * case 1:
+ * If object size ('osize') and number of objects ('onum') are
+ * specified, writes are performed starting at random offsets on
+ * each randomly selected object from the prefilled set.
+ *
+ * case 2:
+ * If 'osize' and 'onum' are not specified(set to 0), writes are
+ * performed starting at offset 0 on a new object.
+ *
+ * The test writes the specified 'count' of bytes in 'bsize'
+ * chunks. Note that in case 2 the object size will be equal to
+ * the block size.
+ */
+void OSDBenchTest::perform_write_test()
+{
+  std::mt19937 random_gen(std::random_device{}());
+  bufferlist bl;
+
+  utime_t start = ceph_clock_now();
+  for (int64_t bytes_written = 0;
+       bytes_written < count;
+       bytes_written += bsize) {
+    std::string nm;
+    unsigned offset = 0;
+    bufferptr bp(bsize);
+    memset(bp.c_str(), random_gen() & 0xff, bp.length());
+    bl.push_back(std::move(bp));
+    bl.rebuild_page_aligned();
+    if (onum && osize) {
+      nm = fmt::format("disk_bw_test_{}", (int)(random_gen() % onum));
+      offset = random_gen() % (osize / bsize) * bsize;
+    } else {
+      nm = fmt::format("disk_bw_test_{}", (long long)bytes_written);
+    }
+    object_t oid(nm);
+    hobject_t soid(sobject_t(oid, 0));
+    // The write transaction relies on the ObjectStore's
+    // internal throttling implementation.
+    ObjectStore::Transaction t;
+    t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
+    store->queue_transaction(ch, std::move(t), nullptr);
+    if (!onum || !osize) {
+      cleanupt.remove(coll_t::meta(), ghobject_t(soid));
+    }
+    bl.clear();
+  }
+
+  wait_for_flush_commit();
+
+  elapsed = ceph_clock_now() - start;
+  dout(0) << fmt::format(
+                 "{}: Test took {:.2f} secs.",
+                 __func__, elapsed)
+          << dendl;
+}
+
+/**
+ * Initiates the transaction to cleanup all the test objects
+ */
+void OSDBenchTest::cleanup()
+{
+  store->queue_transaction(ch, std::move(cleanupt), nullptr);
+  wait_for_flush_commit();
+  dout(0) << fmt::format("{}: Clean-up done.", __func__) << dendl;
+}
+
+OSDBenchTest::OSDBenchTest(
+  CephContext *cct,
+  ObjectStore *store,
+  ObjectStore::CollectionHandle& ch,
+  int64_t count,
+  int64_t bsize,
+  int64_t osize,
+  int64_t onum)
+  : cct(cct),
+    store(store),
+    ch(ch),
+    count(count),
+    bsize(bsize),
+    osize(osize),
+    onum(onum),
+    prefill_time(0.0),
+    elapsed(0.0),
+    bandwidth(0.0),
+    iops(0.0)
+{
+  dout(0) << fmt::format(
+                 "OSD Bench Test Params:"
+                 " count: {} Bytes block size: {}"
+                 " number of objects: {} object size: {}",
+                 count, byte_u_t(bsize), onum, byte_u_t(osize))
+          << dendl;
+}
+
 namespace ceph::osd_cmds {
 
 int heap(CephContext& cct,
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 84f70c86a40..6dfee8390cc 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1065,6 +1065,169 @@ struct OSDShard {
     unsigned osd_op_queue_cut_off);
 };
 
+struct OSDBenchTest {
+  CephContext *cct;
+  ObjectStore *store;
+  ObjectStore::CollectionHandle ch;
+
+  // Input parameters
+  int64_t count;
+  int64_t bsize;
+  int64_t osize;
+  int64_t onum;
+
+  // Test metrics
+  double prefill_time;
+  double elapsed;
+  double bandwidth;
+  double iops;
+  std::ostringstream errmsg;
+
+  // Transaction to clean-up test objects
+  ObjectStore::Transaction cleanupt;
+
+  /**
+   * run_test()
+   *
+   * Run a bench test based on the set parameters.
+   *  - Test sequential writes to objects (from offset 0) if
+   *    only 'count' and 'bsize' are specified.
+   *  - Test random writes to objects if all input parameters
+   *    are specified.
+   *
+   * @return 0 on success. -ENOENT, -EINVAL or -EIO otherwise.
+   *
+   */
+  int run_test();
+
+  /**
+   * precheck()
+   *
+   * Performs multiple pre-checks for proper test execution.
+   * Note: Sets an error message in case of any failure which
+   *       can be accessed using get_errstr().
+   *
+   * @return 0 on success. -ENOENT or -EINVAL otherwise.
+   */
+  int precheck();
+
+  /**
+   * prefill_objects()
+   *
+   * Prefill objects for the write test only if
+   * 'osize' and 'onum' are specified.
+   */
+  void prefill_objects();
+
+  /**
+   * perform_write_test()
+   *
+   * Performs one of the following write tests based on the set
+   * parameters:
+   *  - write to a new object from offset 0, or,
+   *  - write to an already prefilled random object
+   *    (from a random offset).
+   */
+  void perform_write_test();
+
+  /**
+   * wait_for_flush_commit()
+   *
+   * Waits until all the writes to the ObjectStore are flushed
+   * and committed.
+   */
+  void wait_for_flush_commit();
+
+  /**
+   * cleanup()
+   *
+   * Removes all the objects that were created during the write test
+   * as part of a single transaction.
+   */
+  void cleanup();
+
+  /**
+   * flush_store_cache()
+   *
+   * Flushes the ObjectStore cache.
+   * Called before starting an ObjectStore transaction.
+   *
+   * @return 0 on successful flush.
+   */
+  int flush_store_cache() {
+    return store->flush_cache();
+  }
+
+  /**
+   * get_elapsed_time()
+   *
+   * Returns the time taken for the write test
+   * (including flush & commit).
+   *
+   * @return double indicating time taken for the test
+   */
+  double get_elapsed_time() {
+    return elapsed;
+  }
+
+  /**
+   * get_prefill_time()
+   *
+   * Returns the time taken to prefill objects for the test
+   * (including flush & commit).
+   *
+   * @return double indicating time taken for the prefill
+   */
+  double get_prefill_time() {
+    return prefill_time;
+  }
+
+  /**
+   * get_bandwidth_rate()
+   *
+   * Returns the bandwidth rate calculated during the test.
+   * @see run_test() implementation
+   *
+   * @return double indicating bandwidth measured during the test
+   */
+  double get_bandwidth_rate() {
+    return bandwidth;
+  }
+
+  /**
+   * get_iops_rate()
+   *
+   * Returns the iops rate calculated during the test.
+   * @see run_test() implementation
+   *
+   * @return double indicating the iops measured during the test
+   */
+  double get_iops_rate() {
+    return iops;
+  }
+
+  /**
+   * get_errstr()
+   *
+   * Returns the error message (if any) encountered at any stage during
+   * the course of the test.
+   *
+   * @return string indicating error(if any) during the test. Empty otherwise.
+   */
+  std::string get_errstr() {
+    return errmsg.str();
+  }
+
+  OSDBenchTest(
+    CephContext *cct,
+    ObjectStore *store,
+    ObjectStore::CollectionHandle& ch,
+    int64_t count,
+    int64_t bsize,
+    int64_t osize,
+    int64_t onum);
+};
+
 class OSD : public Dispatcher,
 	    public md_config_obs_t {
   using OpSchedulerItem = ceph::osd::scheduler::OpSchedulerItem;
@@ -2010,6 +2173,9 @@ private:
 		  int whoami,
 		  std::string osdspec_affinity);
 
+  static tl::expected<std::string, int>
+    run_osd_bench(CephContext *cct, ObjectStore *store);
+
   /* remove any non-user xattrs from a std::map of them */
   void filter_xattrs(std::map<std::string, ceph::buffer::ptr>& attrs) {
     for (std::map<std::string, ceph::buffer::ptr>::iterator iter = attrs.begin();
diff --git a/src/vstart.sh b/src/vstart.sh
index 63c23f38d84..43af9c01b39 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -388,6 +388,9 @@ crimson_alienstore_physical_only=0
 crimson_balance_cpu="" # "osd", "socket"
 crimson_poll_mode=false
 
+# default value for the benchmark option
+run_benchmark=0
+
 while [ $# -ge 1 ]; do
 case $1 in
     -d | --debug)
@@ -444,6 +447,10 @@ case $1 in
         ;;
     --osd-args)
         extra_osd_args="$2"
+        if [ "$extra_osd_args" == "--run-benchmark" ]; then
+            run_benchmark=1
+            extra_osd_args=""
+        fi
         shift
         ;;
     --msgr1)
@@ -1333,6 +1340,26 @@ EOF
         key = $OSD_SECRET
 EOF
         fi
+        # Run the osd benchmark if requested
+        if [ "$run_benchmark" -eq 1 ]; then
+            echo "running $SUDO $CEPH_BIN/$ceph_osd --run-benchmark -i $osd $ARGS"
+            osd_bench_result=$($SUDO $CEPH_BIN/$ceph_osd --run-benchmark -i $osd $ARGS)
+            echo "osd_bench_result: $osd_bench_result"
+            local run_status=$(echo "$osd_bench_result" | jq -r '.status')
+            if [ "$run_status" == "0" ]; then
+                local iops=$(echo "$osd_bench_result" | jq -r '.iops')
+                local is_rotational=$(echo "$osd_bench_result" | jq -r '.is_rotational')
+                if [ "$is_rotational" -eq 1 ]; then
+                    wconf <<EOF
+        osd mclock max capacity iops hdd = $iops
+EOF
+                else
+                    wconf <<EOF
+        osd mclock max capacity iops ssd = $iops
+EOF
+                fi
+            fi
+        fi
         echo start osd.$osd
         local osd_pid
         echo 'osd' $osd $SUDO $CEPH_BIN/$ceph_osd \
-- 
2.39.5