From d718b3b01a3d26a2719fb26e3e5316d51982ec51 Mon Sep 17 00:00:00 2001 From: Jos Collin Date: Wed, 25 Nov 2020 15:38:08 +0530 Subject: [PATCH] qa: test DispatchQueue throttling Fixes: https://tracker.ceph.com/issues/46226 Signed-off-by: Jos Collin --- .../multiclient/tasks/cephfs_misc_tests.yaml | 1 + .../fs/thrash/multifs/tasks/1-thrash/mds.yaml | 1 + .../fs/thrash/multifs/tasks/1-thrash/mon.yaml | 1 + .../thrash/workloads/tasks/1-thrash/mds.yaml | 1 + .../thrash/workloads/tasks/1-thrash/mon.yaml | 1 + .../thrash/workloads/tasks/1-thrash/osd.yaml | 1 + .../all/dispatch-queue-throttle-warnings.yaml | 18 +++++++++++ qa/tasks/cephfs/test_misc.py | 22 +++++++++++++ .../test_dispatch_queue_throttle_warnings.sh | 32 +++++++++++++++++++ 9 files changed, 78 insertions(+) create mode 100644 qa/suites/rados/singleton-nomsgr/all/dispatch-queue-throttle-warnings.yaml create mode 100644 qa/workunits/rados/test_dispatch_queue_throttle_warnings.sh diff --git a/qa/suites/fs/multiclient/tasks/cephfs_misc_tests.yaml b/qa/suites/fs/multiclient/tasks/cephfs_misc_tests.yaml index e6d6ef99b15..14dbb474bd0 100644 --- a/qa/suites/fs/multiclient/tasks/cephfs_misc_tests.yaml +++ b/qa/suites/fs/multiclient/tasks/cephfs_misc_tests.yaml @@ -12,3 +12,4 @@ overrides: - MDS_CLIENT_LATE_RELEASE - responding to mclientcaps - RECENT_CRASH + - DISPATCH_QUEUE_THROTTLE diff --git a/qa/suites/fs/thrash/multifs/tasks/1-thrash/mds.yaml b/qa/suites/fs/thrash/multifs/tasks/1-thrash/mds.yaml index 33748cea5cd..575a7ba1f1d 100644 --- a/qa/suites/fs/thrash/multifs/tasks/1-thrash/mds.yaml +++ b/qa/suites/fs/thrash/multifs/tasks/1-thrash/mds.yaml @@ -5,3 +5,4 @@ overrides: ceph: log-ignorelist: - Replacing daemon mds + - DISPATCH_QUEUE_THROTTLE diff --git a/qa/suites/fs/thrash/multifs/tasks/1-thrash/mon.yaml b/qa/suites/fs/thrash/multifs/tasks/1-thrash/mon.yaml index fbbe16151ce..0672ede4647 100644 --- a/qa/suites/fs/thrash/multifs/tasks/1-thrash/mon.yaml +++ 
b/qa/suites/fs/thrash/multifs/tasks/1-thrash/mon.yaml @@ -3,6 +3,7 @@ overrides: log-ignorelist: - overall HEALTH_ - \(MON_DOWN\) + - DISPATCH_QUEUE_THROTTLE tasks: - mon_thrash: check_mds_failover: True diff --git a/qa/suites/fs/thrash/workloads/tasks/1-thrash/mds.yaml b/qa/suites/fs/thrash/workloads/tasks/1-thrash/mds.yaml index 33748cea5cd..575a7ba1f1d 100644 --- a/qa/suites/fs/thrash/workloads/tasks/1-thrash/mds.yaml +++ b/qa/suites/fs/thrash/workloads/tasks/1-thrash/mds.yaml @@ -5,3 +5,4 @@ overrides: ceph: log-ignorelist: - Replacing daemon mds + - DISPATCH_QUEUE_THROTTLE diff --git a/qa/suites/fs/thrash/workloads/tasks/1-thrash/mon.yaml b/qa/suites/fs/thrash/workloads/tasks/1-thrash/mon.yaml index fbbe16151ce..0672ede4647 100644 --- a/qa/suites/fs/thrash/workloads/tasks/1-thrash/mon.yaml +++ b/qa/suites/fs/thrash/workloads/tasks/1-thrash/mon.yaml @@ -3,6 +3,7 @@ overrides: log-ignorelist: - overall HEALTH_ - \(MON_DOWN\) + - DISPATCH_QUEUE_THROTTLE tasks: - mon_thrash: check_mds_failover: True diff --git a/qa/suites/fs/thrash/workloads/tasks/1-thrash/osd.yaml b/qa/suites/fs/thrash/workloads/tasks/1-thrash/osd.yaml index 037d399a702..759816764b7 100644 --- a/qa/suites/fs/thrash/workloads/tasks/1-thrash/osd.yaml +++ b/qa/suites/fs/thrash/workloads/tasks/1-thrash/osd.yaml @@ -5,5 +5,6 @@ overrides: - objects unfound and apparently lost - MDS_SLOW_METADATA_IO - MDS_TRIM + - DISPATCH_QUEUE_THROTTLE tasks: - thrashosds: diff --git a/qa/suites/rados/singleton-nomsgr/all/dispatch-queue-throttle-warnings.yaml b/qa/suites/rados/singleton-nomsgr/all/dispatch-queue-throttle-warnings.yaml new file mode 100644 index 00000000000..e572cae45f6 --- /dev/null +++ b/qa/suites/rados/singleton-nomsgr/all/dispatch-queue-throttle-warnings.yaml @@ -0,0 +1,18 @@ +roles: +- [mon.a, mgr.x, osd.0, client.0] +tasks: +- install: +- ceph: + pre-mgr-commands: + - sudo ceph config set mgr mgr_pool false --force + conf: + osd: +# we may land on ext4 + osd max object name len: 400 + osd max 
object namespace len: 64 + log-ignorelist: + - DISPATCH_QUEUE_THROTTLE +- workunit: + clients: + all: + - rados/test_dispatch_queue_throttle_warnings.sh diff --git a/qa/tasks/cephfs/test_misc.py b/qa/tasks/cephfs/test_misc.py index 14f54a784e7..026e7a170c6 100644 --- a/qa/tasks/cephfs/test_misc.py +++ b/qa/tasks/cephfs/test_misc.py @@ -527,6 +527,28 @@ class TestMisc(CephFSTestCase): self.run_ceph_cmd('tell', 'cephfs.c', 'something') self.assertEqual(ce.exception.exitstatus, 1) + def test_dispatch_queue_throttle_cluster_log(self): + """ + That the cluster logs a warning when the Dispatch Queue Throttle Limit is hit. + """ + self.config_set('global', 'ms_dispatch_throttle_bytes', 500) + self.config_set('global', 'ms_dispatch_throttle_log_interval', 5) + # Create files & split across 10 directories, 1000 each. + with self.assert_cluster_log("DISPATCH_QUEUE_THROTTLE", + invert_match=False, watch_channel="cluster"): + for i in range(0, 10): + self.mount_a.create_n_files("dir{0}/file".format(i), 1000, sync=False) + + def test_dispatch_queue_throttle_health_warn(self): + """ + That a health warning is generated when the Dispatch Queue Throttle Limit is hit. + """ + self.config_set('global', 'ms_dispatch_throttle_bytes', 10) + self.config_set('global', 'ms_dispatch_throttle_log_interval', 1) + # Create files & split across 10 directories, 1000 each. 
+ for i in range(0, 10): + self.mount_a.create_n_files("dir{0}/file".format(i), 1000, sync=False) + self.wait_for_health("DISPATCH_QUEUE_THROTTLE", 120) @classhook('_add_session_client_evictions') class TestSessionClientEvict(CephFSTestCase): diff --git a/qa/workunits/rados/test_dispatch_queue_throttle_warnings.sh b/qa/workunits/rados/test_dispatch_queue_throttle_warnings.sh new file mode 100644 index 00000000000..7f50212c9e0 --- /dev/null +++ b/qa/workunits/rados/test_dispatch_queue_throttle_warnings.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +set -uex + +# number of osds = 1 +crushtool -o crushmap --build --num_osds 1 host straw 2 rack straw 2 row straw 2 root straw 0 +ceph osd setcrushmap -i crushmap +ceph osd tree +ceph tell osd.* injectargs --osd_max_markdown_count 1024 --osd_max_markdown_period 1 +ceph osd set noout + +wait_for_healthy() { + while ceph health detail | grep "DISPATCH_QUEUE_THROTTLE" + do + sleep 1 + done +} + +test_dispatch_queue_throttle() { + ceph config set global ms_dispatch_throttle_bytes 10 + ceph config set global ms_dispatch_throttle_log_interval 1 + ceph health detail + ceph health | grep "Dispatch Queue Throttling" + ceph health detail | grep "DISPATCH_QUEUE_THROTTLE" + ceph config set global ms_dispatch_throttle_bytes 104857600 # default: 100_M + ceph config set global ms_dispatch_throttle_log_interval 30 + wait_for_healthy +} + +test_dispatch_queue_throttle + +exit 0 -- 2.47.3