osd/OSD: Log aggregated slow ops detail to cluster logs (43732/head)
author     Prashant D <pdhange@redhat.com>
           Fri, 29 Oct 2021 13:09:24 +0000 (09:09 -0400)
committer  Prashant D <pdhange@redhat.com>
           Wed, 12 Jan 2022 13:29:02 +0000 (08:29 -0500)
Logging every slow op in detail can overwhelm the cluster log and also
fill up the monitor db. Instead, log slow op details in an aggregated
format.
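
For example, with aggregation enabled, a batch of slow requests produces a
single summary warning rather than one line per op (illustrative counts and
pool name):

  30 slow requests (by type [ 'delayed' : 21 'waiting for sub ops' : 9 ] most affected pool [ 'rbd' : 18 ])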

Fixes: https://tracker.ceph.com/issues/52424
Signed-off-by: Prashant D <pdhange@redhat.com>
src/common/options/osd.yaml.in
src/osd/OSD.cc
src/osd/OpRequest.h

index e5ce3648e921c94c2ba8b7fb56c147f6d074935b..8360e4a922390eeeba948483861f9a0753e818ad 100644 (file)
@@ -739,6 +739,15 @@ options:
   default: 32
   flags:
   - startup
+- name: osd_aggregated_slow_ops_logging
+  type: bool
+  level: advanced
+  desc: Allow the OSD daemon to send aggregated slow ops info to the cluster log
+  fmt_desc: If set to ``true``, the OSD daemon will send slow ops information in
+    an aggregated format to the cluster log; otherwise, it sends every slow op
+    to the cluster log individually.
+  default: true
+  with_legacy: true
 - name: osd_op_num_threads_per_shard
   type: int
   level: advanced
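
The option defaults to true, so aggregation is on out of the box. Per-op
logging can be restored at runtime; a sketch, assuming a deployed cluster and
the standard ceph CLI:

  ceph config set osd osd_aggregated_slow_ops_logging false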
index 8977b19c85aa5714e5878bcd2d1c8924e409ccfc..889dd13c59de99ab722802aa620663f489f4dd40 100644 (file)
@@ -7647,6 +7647,15 @@ vector<DaemonHealthMetric> OSD::get_health_metrics()
     too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
     int slow = 0;
     TrackedOpRef oldest_op;
+    OSDMapRef osdmap = get_osdmap();
+    // slow op counts keyed by flag point (event type), for aggregated
+    // logging to the cluster log.
+    map<uint8_t, int> slow_op_types;
+    // slow op counts keyed by pool, for reporting the pool with the most
+    // slow ops.
+    map<uint64_t, int> slow_op_pools;
+    bool log_aggregated_slow_op =
+           cct->_conf.get_val<bool>("osd_aggregated_slow_ops_logging");
     auto count_slow_ops = [&](TrackedOp& op) {
       if (op.get_initiated() < too_old) {
         stringstream ss;
@@ -7656,7 +7665,19 @@ vector<DaemonHealthMetric> OSD::get_health_metrics()
            << " currently "
            << op.state_string();
         lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
-        clog->warn() << ss.str();
+        if (log_aggregated_slow_op) {
+          if (const OpRequest *req = dynamic_cast<const OpRequest *>(&op)) {
+            uint8_t op_type = req->state_flag();
+            auto m = req->get_req<MOSDFastDispatchOp>();
+            uint64_t poolid = m->get_spg().pgid.m_pool;
+            slow_op_types[op_type]++;
+            if (poolid > 0 && poolid <= (uint64_t) osdmap->get_pool_max()) {
+              slow_op_pools[poolid]++;
+            }
+          }
+        } else {
+          clog->warn() << ss.str();
+        }
        slow++;
        if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
          oldest_op = &op;
@@ -7670,6 +7691,38 @@ vector<DaemonHealthMetric> OSD::get_health_metrics()
       if (slow) {
        derr << __func__ << " reporting " << slow << " slow ops, oldest is "
             << oldest_op->get_desc() << dendl;
+        if (log_aggregated_slow_op && !slow_op_types.empty()) {
+          stringstream ss;
+          ss << slow << " slow requests (by type [ ";
+          for (const auto& [op_type, count] : slow_op_types) {
+            ss << "'" << OpRequest::get_state_string(op_type)
+               << "' : " << count
+               << " ";
+          }
+          bool pool_reported = false;
+          // guard against an empty map: max_element on an empty range
+          // returns end(), which must not be dereferenced.
+          if (!slow_op_pools.empty()) {
+            auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
+                                   [](const auto& p1, const auto& p2) {
+                                     return p1.second < p2.second;
+                                   });
+            if (osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) {
+              string pool_name = osdmap->get_pool_name(slow_pool_it->first);
+              ss << "] most affected pool [ '"
+                 << pool_name
+                 << "' : "
+                 << slow_pool_it->second
+                 << " ])";
+              pool_reported = true;
+            }
+          }
+          if (!pool_reported) {
+            ss << "])";
+          }
+          lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
+          clog->warn() << ss.str();
+        }
       }
       metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
     } else {
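
The aggregation itself is a count-then-pick-the-max pass over the tracked
ops. Below is a minimal standalone sketch of the same pattern, using
hypothetical flag values and pool ids in place of Ceph's tracked-op types:

  #include <algorithm>
  #include <cstdint>
  #include <iostream>
  #include <map>
  #include <utility>

  int main() {
    // Hypothetical slow-op samples: {flag point, pool id}.
    const std::pair<uint8_t, uint64_t> samples[] = {
      {1, 3}, {1, 3}, {2, 3}, {1, 7}, {3, 7},
    };

    std::map<uint8_t, int> slow_op_types;   // slow op count per flag point
    std::map<uint64_t, int> slow_op_pools;  // slow op count per pool
    for (const auto& [flag, pool] : samples) {
      ++slow_op_types[flag];
      ++slow_op_pools[pool];
    }

    // Pool with the most slow ops; std::max_element on an empty range
    // returns end(), so check for emptiness before dereferencing.
    if (!slow_op_pools.empty()) {
      auto most = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
                                   [](const auto& a, const auto& b) {
                                     return a.second < b.second;
                                   });
      std::cout << "most affected pool " << most->first
                << " : " << most->second << " slow ops\n";
    }
    return 0;
  }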
index 79aa21cd392ceca598a9375a4b5ded50ee724f3a..e0bc232a53b2d8428b4aeb6d19d4a8f13cfc80d7 100644 (file)
@@ -103,6 +103,9 @@ public:
       return entity_name_t();
     }
   }
+  uint8_t state_flag() const {
+    return latest_flag_point;
+  }
 
   std::string_view state_string() const override {
     switch(latest_flag_point) {
@@ -117,6 +120,35 @@ public:
     return "no flag points reached";
   }
 
+  static std::string get_state_string(uint8_t flag) {
+    std::string flag_point;
+
+    switch(flag) {
+      case flag_queued_for_pg:
+        flag_point = "queued for pg";
+        break;
+      case flag_reached_pg:
+        flag_point = "reached pg";
+        break;
+      case flag_delayed:
+        flag_point = "delayed";
+        break;
+      case flag_started:
+        flag_point = "started";
+        break;
+      case flag_sub_op_sent:
+        flag_point = "waiting for sub ops";
+        break;
+      case flag_commit_sent:
+        flag_point = "commit sent; apply or cleanup";
+        break;
+      default:
+        flag_point = "no flag points reached";
+        break;
+    }
+    return flag_point;
+  }
+
   void mark_queued_for_pg() {
     mark_flag_point(flag_queued_for_pg, "queued_for_pg");
   }
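
For reference, the aggregated warning built in OSD.cc renders each tallied
flag through this helper, so OpRequest::get_state_string(flag_delayed) yields
"delayed", matching the per-op state_string() output for the same flag point.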