From 1be67670264b3c780345c014d176c27e670bccdb Mon Sep 17 00:00:00 2001
From: Jos Collin <jcollin@redhat.com>
Date: Fri, 9 Jul 2021 16:56:47 +0530
Subject: [PATCH] qa: test `ceph fs perf stats` doesn't output stale metrics

Add a test verifying that `ceph fs perf stats` doesn't output
stale metrics after the rank0 MDS failover.

Fixes: https://tracker.ceph.com/issues/50033
Signed-off-by: Jos Collin <jcollin@redhat.com>
(cherry picked from commit 116e89a2f2849ed7cb711d1ae465c6f510b2810d)
---
 qa/tasks/cephfs/test_mds_metrics.py | 58 ++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/qa/tasks/cephfs/test_mds_metrics.py b/qa/tasks/cephfs/test_mds_metrics.py
index be680bb8600d5..4c85d9f251029 100644
--- a/qa/tasks/cephfs/test_mds_metrics.py
+++ b/qa/tasks/cephfs/test_mds_metrics.py
@@ -5,7 +5,7 @@ import random
 import logging
 import errno
 
-from teuthology.contextutil import safe_while
+from teuthology.contextutil import safe_while, MaxWhileTries
 from teuthology.exceptions import CommandFailedError
 from tasks.cephfs.cephfs_test_case import CephFSTestCase
 
@@ -394,3 +394,59 @@ class TestMDSMetrics(CephFSTestCase):
                 raise
         else:
             raise RuntimeError("expected the 'fs perf stat' command to fail for invalid client_ip")
+
+    def test_perf_stats_stale_metrics(self):
+        """
+        That `ceph fs perf stats` doesn't output stale metrics after the rank0 MDS failover
+        """
+        # baseline: fetch and validate the metrics before the failover
+        valid, metrics = self._get_metrics(self.verify_mds_metrics(
+            active_mds_count=1, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug("metrics={0}".format(metrics))
+        self.assertTrue(valid)
+
+        global_metrics = metrics['global_metrics']
+
+        # TestMDSMetrics.CLIENTS_REQUIRED clients are mounted here, so they should be
+        # the first two entries in global_metrics and won't be culled later on.
+        gm_keys_list = list(global_metrics.keys())  # reused below to look up the same clients
+        client1_metrics = global_metrics[gm_keys_list[0]]
+        client2_metrics = global_metrics[gm_keys_list[1]]
+
+        # fail rank0 mds
+        self.fs.rank_fail(rank=0)
+
+        # Wait for 10 seconds for the failover to complete and
+        # the mgr to get initial metrics from the new rank0 mds.
+        time.sleep(10)
+
+        fscid = self.fs.id
+
+        # spread directory per rank
+        self._spread_directory_on_all_ranks(fscid)
+
+        # spread some I/O
+        self._do_spread_io_all_clients(fscid)
+
+        # wait a bit for mgr to get updated metrics
+        time.sleep(5)
+
+        # validate: re-fetch the metrics and compare them with the pre-failover snapshot
+        try:
+            valid, metrics_new = self._get_metrics(self.verify_mds_metrics(
+                active_mds_count=1, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+            log.debug("metrics={0}".format(metrics_new))
+            self.assertTrue(valid)
+
+            global_metrics = metrics_new['global_metrics']
+            client1_metrics_new = global_metrics[gm_keys_list[0]]
+            client2_metrics_new = global_metrics[gm_keys_list[1]]
+
+            # the metrics must differ from the pre-failover values for the test to succeed.
+            self.assertNotEqual(client1_metrics, client1_metrics_new)
+            self.assertNotEqual(client2_metrics, client2_metrics_new)
+        except MaxWhileTries:
+            raise RuntimeError("Failed to fetch `ceph fs perf stats` metrics")
+        finally:
+            # cleanup test directories (only reached once the second validation has started)
+            self._cleanup_test_dirs()
-- 
2.39.5