git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd/OSD: Log slow ops/types to cluster logs
author: Sridhar Seshasayee <sseshasa@redhat.com>
Fri, 14 Feb 2020 11:13:52 +0000 (16:43 +0530)
committer: Sridhar Seshasayee <sseshasa@redhat.com>
Wed, 19 Feb 2020 09:01:48 +0000 (14:31 +0530)
In addition to logging slow ops in mon and osd specific log files,
re-introduce logging the same information along with slow op type
details to cluster logs as well. The objective is to make debugging
slow ops easier.

Change the log whitelist entry in the qa suites from "slow requests" to
"slow request" so that it also matches the new warning message written
to the cluster log. This should not cause any issue, because the new
entry is a substring of the earlier one.

Fixes: https://tracker.ceph.com/issues/43975
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
qa/suites/rados/singleton/all/mon-memory-target-compliance.yaml.disabled
qa/suites/rados/singleton/all/pg-autoscaler.yaml
qa/tasks/thrashosds-health.yaml
src/osd/OSD.cc

index 56de322ebc4e79f3f63335284a46fe386e825c08..7f9dd49592c1214cb09d5033c120dc510f130b35 100644 (file)
@@ -53,7 +53,7 @@ tasks:
       - \(SLOW_OPS\)
       - \(REQUEST_SLOW\)
       - \(TOO_FEW_PGS\)
-      - slow requests
+      - slow request
 - interactive:
 - parallel:
     - log-mon-rss
index c55fc5072f2f86aece6967820652eeb3a5a917d3..72e18d52f68b174f36efe9c6056d12ab979c7f35 100644 (file)
@@ -31,7 +31,7 @@ tasks:
       - \(SLOW_OPS\)
       - \(REQUEST_SLOW\)
       - \(TOO_FEW_PGS\)
-      - slow requests
+      - slow request
 - workunit:
     clients:
       all:
index d3a954cdab5d4f2e7bdefcf4d00232dc95071c2e..914f6e25ee55967c8a1b10aaf106d3c8e27eaa4a 100644 (file)
@@ -12,4 +12,4 @@ overrides:
       - \(SLOW_OPS\)
       - \(REQUEST_SLOW\)
       - \(TOO_FEW_PGS\)
-      - slow requests
+      - slow request
index 76113d114686940b1d01f8afe147697ab5f6a5bb..1c122163774cc80afb92c0151a873a165955a4db 100644 (file)
@@ -7596,9 +7596,14 @@ vector<DaemonHealthMetric> OSD::get_health_metrics()
     TrackedOpRef oldest_op;
     auto count_slow_ops = [&](TrackedOp& op) {
       if (op.get_initiated() < too_old) {
-       lgeneric_subdout(cct,osd,20) << "slow op " << op.get_desc()
-                                    << " initiated "
-                                    << op.get_initiated() << dendl;
+        stringstream ss;
+        ss << "slow request " << op.get_desc()
+           << " initiated "
+           << op.get_initiated()
+           << " currently "
+           << op.state_string();
+        lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
+        clog->warn() << ss.str();
        slow++;
        if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
          oldest_op = &op;