git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/OSD: Log slow ops/types to cluster logs
author Sridhar Seshasayee <sseshasa@redhat.com>
Fri, 14 Feb 2020 11:13:52 +0000 (16:43 +0530)
committer Sridhar Seshasayee <sseshasa@redhat.com>
Mon, 24 Feb 2020 05:44:37 +0000 (11:14 +0530)
In addition to logging slow ops in the mon- and osd-specific log files,
re-introduce logging of the same information, along with slow op type
details, to the cluster log. The objective is to make debugging slow
ops easier.

Modify the log whitelisting string to "slow request" within the qa suites
so that the search for the new warning log message within the cluster log
succeeds. This should not cause any issues, as the new string is a
substring of the earlier one ("slow requests").

Fixes: https://tracker.ceph.com/issues/43975
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
(cherry picked from commit d20f57000b52d1f17e2cdf666b39fef48cc288cd)

qa/suites/rados/singleton/all/mon-memory-target-compliance.yaml.disabled
qa/suites/rados/singleton/all/pg-autoscaler.yaml
qa/tasks/thrashosds-health.yaml
src/osd/OSD.cc

index 56de322ebc4e79f3f63335284a46fe386e825c08..7f9dd49592c1214cb09d5033c120dc510f130b35 100644 (file)
@@ -53,7 +53,7 @@ tasks:
       - \(SLOW_OPS\)
       - \(REQUEST_SLOW\)
       - \(TOO_FEW_PGS\)
-      - slow requests
+      - slow request
 - interactive:
 - parallel:
     - log-mon-rss
index c55fc5072f2f86aece6967820652eeb3a5a917d3..72e18d52f68b174f36efe9c6056d12ab979c7f35 100644 (file)
@@ -31,7 +31,7 @@ tasks:
       - \(SLOW_OPS\)
       - \(REQUEST_SLOW\)
       - \(TOO_FEW_PGS\)
-      - slow requests
+      - slow request
 - workunit:
     clients:
       all:
index d3a954cdab5d4f2e7bdefcf4d00232dc95071c2e..914f6e25ee55967c8a1b10aaf106d3c8e27eaa4a 100644 (file)
@@ -12,4 +12,4 @@ overrides:
       - \(SLOW_OPS\)
       - \(REQUEST_SLOW\)
       - \(TOO_FEW_PGS\)
-      - slow requests
+      - slow request
index f40a233257d12bd671fab26282f69c2fe3882594..5084a2b4a973e2312088277f204e8cfb199643ec 100644 (file)
@@ -8092,9 +8092,14 @@ vector<DaemonHealthMetric> OSD::get_health_metrics()
     TrackedOpRef oldest_op;
     auto count_slow_ops = [&](TrackedOp& op) {
       if (op.get_initiated() < too_old) {
-       lgeneric_subdout(cct,osd,20) << "slow op " << op.get_desc()
-                                    << " initiated "
-                                    << op.get_initiated() << dendl;
+        stringstream ss;
+        ss << "slow request " << op.get_desc()
+           << " initiated "
+           << op.get_initiated()
+           << " currently "
+           << op.state_string();
+        lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
+        clog->warn() << ss.str();
        slow++;
        if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
          oldest_op = &op;