]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/OSD: Log aggregated slow ops detail to cluster logs 45154/head
authorPrashant D <pdhange@redhat.com>
Fri, 29 Oct 2021 13:09:24 +0000 (09:09 -0400)
committerPonnuvel Palaniyappan <pponnuvel@gmail.com>
Fri, 20 May 2022 08:12:04 +0000 (09:12 +0100)
Slow requests can overwhelm a cluster log with every slow op in
detail and also fills up the monitor db. Instead, log slow ops
details in aggregated format.

Fixes: https://tracker.ceph.com/issues/52424
Signed-off-by: Prashant D <pdhange@redhat.com>
(cherry picked from commit 9319dc9273b36dc4f4bef1261b3c57690336a8cc)

Conflicts:
    src/common/options/osd.yaml.in
- Octopus doesn't have osd.yaml.in; added the option in src/common/options.cc

src/common/legacy_config_opts.h
src/common/options.cc
src/osd/OSD.cc
src/osd/OpRequest.h

index 832af68a8ef477dbccb6ec4153404faaea43f0e5..b115363860654117952aac2a993bde388a9f05cd 100644 (file)
@@ -536,6 +536,9 @@ OPTION(osd_agent_min_evict_effort, OPT_FLOAT)
 OPTION(osd_agent_quantize_effort, OPT_FLOAT)
 OPTION(osd_agent_delay_time, OPT_FLOAT)
 
+// Allow OSD daemon to send an aggregated slow ops to the cluster log
+OPTION(osd_aggregated_slow_ops_logging, OPT_BOOL)
+
 // osd ignore history.last_epoch_started in find_best_info
 OPTION(osd_find_best_info_ignore_history_les, OPT_BOOL)
 
index 2397748e791f25f8daa15a031a9c69cf99be5c8c..9467b6fafcba89cddb431f178ca59647015f56e9 100644 (file)
@@ -2527,6 +2527,11 @@ std::vector<Option> get_global_options() {
     .set_default(.02)
     .set_description("slop factor to avoid switching tiering flush and eviction mode"),
 
+    Option("osd_aggregated_slow_ops_logging", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Allow OSD daemon to send an aggregated slow ops to the cluster log")
+    .set_long_description("If set to ``true``, the OSD daemon will send slow ops information in an aggregated format to the cluster log else sends every slow op to the cluster log."),
+
     Option("osd_uuid", Option::TYPE_UUID, Option::LEVEL_ADVANCED)
     .set_default(uuid_d())
     .set_flag(Option::FLAG_CREATE)
index 4ba6ef84f90850767fca53e2ab15e0764e02b67f..fa94111beed7f325bcb059dff96b02ae92fc1f6b 100644 (file)
@@ -7601,6 +7601,15 @@ vector<DaemonHealthMetric> OSD::get_health_metrics()
     too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
     int slow = 0;
     TrackedOpRef oldest_op;
+    OSDMapRef osdmap = get_osdmap();
+    // map of slow op counts by slow op event type for an aggregated logging to
+    // the cluster log.
+    map<uint8_t, int> slow_op_types;
+    // map of slow op counts by pool for reporting a pool name with highest
+    // slow ops.
+    map<uint64_t, int> slow_op_pools;
+    bool log_aggregated_slow_op =
+           cct->_conf.get_val<bool>("osd_aggregated_slow_ops_logging");
     auto count_slow_ops = [&](TrackedOp& op) {
       if (op.get_initiated() < too_old) {
         stringstream ss;
@@ -7610,7 +7619,19 @@ vector<DaemonHealthMetric> OSD::get_health_metrics()
            << " currently "
            << op.state_string();
         lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
-        clog->warn() << ss.str();
+        if (log_aggregated_slow_op) {
+          if (const OpRequest *req = dynamic_cast<const OpRequest *>(&op)) {
+            uint8_t op_type = req->state_flag();
+            auto m = req->get_req<MOSDFastDispatchOp>();
+            uint64_t poolid = m->get_spg().pgid.m_pool;
+            slow_op_types[op_type]++;
+            if (poolid > 0 && poolid <= (uint64_t) osdmap->get_pool_max()) {
+              slow_op_pools[poolid]++;
+            }
+          }
+        } else {
+          clog->warn() << ss.str();
+        }
        slow++;
        if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
          oldest_op = &op;
@@ -7624,6 +7645,32 @@ vector<DaemonHealthMetric> OSD::get_health_metrics()
       if (slow) {
        derr << __func__ << " reporting " << slow << " slow ops, oldest is "
             << oldest_op->get_desc() << dendl;
+        if (log_aggregated_slow_op &&
+             slow_op_types.size() > 0) {
+          stringstream ss;
+          ss << slow << " slow requests (by type [ ";
+          for (const auto& [op_type, count] : slow_op_types) {
+            ss << "'" << OpRequest::get_state_string(op_type)
+               << "' : " << count
+               << " ";
+          }
+          auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
+                                 [](std::pair<uint64_t, int> p1, std::pair<uint64_t, int> p2) {
+                                   return p1.second < p2.second;
+                                 });
+          if (osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) {
+            string pool_name = osdmap->get_pool_name(slow_pool_it->first);
+            ss << "] most affected pool [ '"
+               << pool_name
+               << "' : "
+               << slow_pool_it->second
+               << " ])";
+          } else {
+            ss << "])";
+          }
+          lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
+          clog->warn() << ss.str();
+        }
       }
       metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
     } else {
index dc5b5e34b1f2d28cac3f653736885c4baddec828..443dbf292e2bf027336038a2ac27cad48c95f363 100644 (file)
@@ -102,6 +102,9 @@ public:
       return entity_name_t();
     }
   }
+  uint8_t state_flag() const {
+    return latest_flag_point;
+  }
 
   std::string_view state_string() const override {
     switch(latest_flag_point) {
@@ -116,6 +119,32 @@ public:
     return "no flag points reached";
   }
 
+  static std::string get_state_string(uint8_t flag) {
+    std::string flag_point;
+
+    switch(flag) {
+      case flag_queued_for_pg:
+        flag_point = "queued for pg";
+        break;
+      case flag_reached_pg:
+        flag_point = "reached pg";
+        break;
+      case flag_delayed:
+        flag_point = "delayed";
+        break;
+      case flag_started:
+        flag_point = "started";
+        break;
+      case flag_sub_op_sent:
+        flag_point = "waiting for sub ops";
+        break;
+      case flag_commit_sent:
+        flag_point = "commit sent; apply or cleanup";
+        break;
+    }
+    return flag_point;
+  }
+
   void mark_queued_for_pg() {
     mark_flag_point(flag_queued_for_pg, "queued_for_pg");
   }