From 489c6ba1411356613bc2f0444442420fff9cfe6d Mon Sep 17 00:00:00 2001 From: Kamoltat Date: Thu, 22 Feb 2024 16:55:26 +0000 Subject: [PATCH] qa/suites: added more whitelisting + fix typo Problem: 1. Not enough whitelisting for certain Cephadm failures 2. previous PR that landed has a typo that causes https://tracker.ceph.com/issues/64452 Solution: 1. Add more whitelisting 2. Fix typo in https://tracker.ceph.com/issues/64452 Fixes: https://tracker.ceph.com/issues/64452 Signed-off-by: Kamoltat --- qa/cephfs/overrides/ignorelist_health.yaml | 4 ++++ .../ignorelist_wrongly_marked_down.yaml | 2 ++ .../orch/cephadm/dashboard/task/test_e2e.yaml | 19 +++++++++++++++++++ .../orch/cephadm/mgr-nfs-upgrade/1-start.yaml | 19 +++++++++++++++---- .../cephadm/osds/2-ops/rmdir-reactivate.yaml | 1 + qa/suites/orch/cephadm/thrash/2-thrash.yaml | 12 +++++++++++- .../orch/cephadm/workunits/task/test_nfs.yaml | 1 + .../cephadm/workunits/task/test_orch_cli.yaml | 1 + .../rados/basic/tasks/rados_api_tests.yaml | 1 + qa/tasks/thrashosds-health.yaml | 4 ++++ 10 files changed, 59 insertions(+), 5 deletions(-) diff --git a/qa/cephfs/overrides/ignorelist_health.yaml b/qa/cephfs/overrides/ignorelist_health.yaml index ac6d32045c8d..a698da517b4d 100644 --- a/qa/cephfs/overrides/ignorelist_health.yaml +++ b/qa/cephfs/overrides/ignorelist_health.yaml @@ -2,7 +2,10 @@ overrides: ceph: log-ignorelist: - overall HEALTH_ + - \(CEPHADM_STRAY_DAEMON\) - \(FS_DEGRADED\) + - FS_ + - \(CEPHADM_ - \(MDS_FAILED\) - \(MDS_DEGRADED\) - \(FS_WITH_FAILED_MDS\) @@ -13,6 +16,7 @@ overrides: - \(PG_DEGRADED\) - Degraded data redundancy - \(PG_ + - acting - MDS_INSUFFICIENT_STANDBY - deprecated feature inline_data - compat changed unexpectedly diff --git a/qa/cephfs/overrides/ignorelist_wrongly_marked_down.yaml b/qa/cephfs/overrides/ignorelist_wrongly_marked_down.yaml index abd26643bacf..64c8c24f5978 100644 --- a/qa/cephfs/overrides/ignorelist_wrongly_marked_down.yaml +++ b/qa/cephfs/overrides/ignorelist_wrongly_marked_down.yaml @@ -7,3 +7,5 @@ overrides: - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding + - is down + - osds down diff --git a/qa/suites/orch/cephadm/dashboard/task/test_e2e.yaml b/qa/suites/orch/cephadm/dashboard/task/test_e2e.yaml index b4ed447bf582..ca7268ac6892 100644 --- a/qa/suites/orch/cephadm/dashboard/task/test_e2e.yaml +++ b/qa/suites/orch/cephadm/dashboard/task/test_e2e.yaml @@ -4,6 +4,25 @@ overrides: - \(HOST_IN_MAINTENANCE\) - \(OSD_DOWN\) - \(MON_DOWN\) + - down + - overall HEALTH_ + - \(CEPHADM_STRAY_DAEMON\) + - stray daemon + - \(FS_DEGRADED\) + - \(MDS_FAILED\) + - \(MDS_DEGRADED\) + - \(FS_WITH_FAILED_MDS\) + - \(MDS_DAMAGE\) + - \(MDS_ALL_DOWN\) + - \(MDS_UP_LESS_THAN_MAX\) + - \(FS_INLINE_DATA_DEPRECATED\) + - \(PG_DEGRADED\) + - Degraded data redundancy + - \(PG_ + - acting + - MDS_INSUFFICIENT_STANDBY + - deprecated feature inline_data + - compat changed unexpectedly roles: # 3 osd roles on host.a is required for cephadm task. It checks if the cluster is healthy. # More daemons will be deployed on both hosts in e2e tests. diff --git a/qa/suites/orch/cephadm/mgr-nfs-upgrade/1-start.yaml b/qa/suites/orch/cephadm/mgr-nfs-upgrade/1-start.yaml index 8a45050d4083..db4b26053949 100644 --- a/qa/suites/orch/cephadm/mgr-nfs-upgrade/1-start.yaml +++ b/qa/suites/orch/cephadm/mgr-nfs-upgrade/1-start.yaml @@ -1,7 +1,3 @@ -overrides: - ceph: - log-ignorelist: - - slow requests tasks: - cephadm.shell: host.a: @@ -28,6 +24,21 @@ openstack: size: 10 # GB overrides: ceph: + log-ignorelist: + - slow requests + - \(PG_ + - PG_ + - \(CEPHADM_STRAY_DAEMON\) + - slow request + - \(MDS_ + - MDS_ + - osds down + - OSD_ + - \(OSD_ + - client + - FS_ + - \(FS_ + - degraded conf: osd: osd shutdown pgref assert: true diff --git a/qa/suites/orch/cephadm/osds/2-ops/rmdir-reactivate.yaml b/qa/suites/orch/cephadm/osds/2-ops/rmdir-reactivate.yaml index e0706e0dce91..501dea155836 100644 --- a/qa/suites/orch/cephadm/osds/2-ops/rmdir-reactivate.yaml +++ b/qa/suites/orch/cephadm/osds/2-ops/rmdir-reactivate.yaml @@ -5,6 +5,7 @@ overrides: - \(OSD_DOWN\) - \(PG_ - but it is still running + - \(CEPHADM_STRAY_DAEMON\) tasks: - cephadm.shell: host.a: diff --git a/qa/suites/orch/cephadm/thrash/2-thrash.yaml b/qa/suites/orch/cephadm/thrash/2-thrash.yaml index 591538bad9ca..2f45d7676589 100644 --- a/qa/suites/orch/cephadm/thrash/2-thrash.yaml +++ b/qa/suites/orch/cephadm/thrash/2-thrash.yaml @@ -7,9 +7,19 @@ overrides: - \(OSDMAP_FLAGS\) - flag\(s\) set - \(CACHE_POOL_NO_HIT_SET\) + - \(CACHE_ - \(PG_ - \(OSD_ - - mons down: + - \(POOL_ + - \(CEPHADM_STRAY_DAEMON\) + - PG_ + - CACHE_ + - degraded + - backfill + - mons down + - OSD_ + - is down + - acting conf: osd: osd debug reject backfill probability: .3 diff --git a/qa/suites/orch/cephadm/workunits/task/test_nfs.yaml b/qa/suites/orch/cephadm/workunits/task/test_nfs.yaml index 5e1ea3d5e03b..afa9deecb8e4 100644 --- a/qa/suites/orch/cephadm/workunits/task/test_nfs.yaml +++ b/qa/suites/orch/cephadm/workunits/task/test_nfs.yaml @@ -3,6 +3,7 @@ overrides: log-ignorelist: - Replacing daemon mds - FS_DEGRADED + - \(CEPHADM_STRAY_DAEMON\) roles: - - host.a - osd.0 diff --git a/qa/suites/orch/cephadm/workunits/task/test_orch_cli.yaml b/qa/suites/orch/cephadm/workunits/task/test_orch_cli.yaml index 723c6ad16dc6..a1b8a4c0f899 100644 --- a/qa/suites/orch/cephadm/workunits/task/test_orch_cli.yaml +++ b/qa/suites/orch/cephadm/workunits/task/test_orch_cli.yaml @@ -3,6 +3,7 @@ overrides: log-ignorelist: - \(MON_DOWN\) - \(OSD_DOWN\) + - \(CEPHADM_PAUSED\) - mons down roles: - - host.a diff --git a/qa/suites/rados/basic/tasks/rados_api_tests.yaml b/qa/suites/rados/basic/tasks/rados_api_tests.yaml index c5c8c45ff6db..47b293e4c135 100644 --- a/qa/suites/rados/basic/tasks/rados_api_tests.yaml +++ b/qa/suites/rados/basic/tasks/rados_api_tests.yaml @@ -12,6 +12,7 @@ overrides: - \(PG_AVAILABILITY\) - \(PG_DEGRADED\) - \(MON_DOWN\) + - \(CEPHADM_STRAY_DAEMON\) - missing hit_sets - do not have an application enabled - application not enabled on pool diff --git a/qa/tasks/thrashosds-health.yaml b/qa/tasks/thrashosds-health.yaml index b3101abf2d55..2340944e8851 100644 --- a/qa/tasks/thrashosds-health.yaml +++ b/qa/tasks/thrashosds-health.yaml @@ -24,3 +24,7 @@ overrides: - PG_ - Reduced data availability - stuck undersized + - backfill_toofull + - is down + - stuck peering + - acting -- 2.47.3