git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: track high water mark for purges 32667/head
author Patrick Donnelly <pdonnell@redhat.com>
Wed, 15 Jan 2020 23:59:01 +0000 (15:59 -0800)
committer Patrick Donnelly <pdonnell@redhat.com>
Tue, 21 Jan 2020 01:06:19 +0000 (17:06 -0800)
This makes the corresponding test not racy.

Fixes: https://tracker.ceph.com/issues/16881
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
qa/tasks/cephfs/test_strays.py
src/mds/PurgeQueue.cc
src/mds/PurgeQueue.h

index 96569dfab48fd029173ed8a17731f567b239422b..e02eca2d68a1f61c2da7c975df85ffc7834b2434 100644 (file)
@@ -194,11 +194,10 @@ class TestStrays(CephFSTestCase):
             num_strays = mdc_stats['num_strays']
             num_strays_purging = pq_stats['pq_executing']
             num_purge_ops = pq_stats['pq_executing_ops']
+            files_high_water = pq_stats['pq_executing_high_water']
+            ops_high_water = pq_stats['pq_executing_ops_high_water']
 
-            self.data_log.append([datetime.datetime.now(), num_strays, num_strays_purging, num_purge_ops])
-
-            files_high_water = max(files_high_water, num_strays_purging)
-            ops_high_water = max(ops_high_water, num_purge_ops)
+            self.data_log.append([datetime.datetime.now(), num_strays, num_strays_purging, num_purge_ops, files_high_water, ops_high_water])
 
             total_strays_created = mdc_stats['strays_created']
             total_strays_purged = pq_stats['pq_executed']
@@ -242,11 +241,18 @@ class TestStrays(CephFSTestCase):
                 raise RuntimeError("Ops in flight high water is unexpectedly low ({0} / {1})".format(
                     ops_high_water, mds_max_purge_ops
                 ))
+            # The MDS may go over mds_max_purge_ops for some items, like a
+            # heavily fragmented directory.  The throttle does not kick in
+            # until *after* we reach or exceed the limit.  This is expected
+            # because we don't want to starve the PQ or never purge a
+            # particularly large file/directory.
+            self.assertLessEqual(ops_high_water, mds_max_purge_ops+64)
         elif throttle_type == self.FILES_THROTTLE:
             if files_high_water < mds_max_purge_files / 2:
                 raise RuntimeError("Files in flight high water is unexpectedly low ({0} / {1})".format(
                     files_high_water, mds_max_purge_files
                 ))
+            self.assertLessEqual(files_high_water, mds_max_purge_files)
 
         # Sanity check all MDC stray stats
         stats = self.fs.mds_asok(['perf', 'dump'])
index 55430c97207492a13c833539f816b6ea598e0233..0c8b45c0cbbd145ef7fc080f46e7946ebe0cdd1b 100644 (file)
@@ -114,7 +114,9 @@ void PurgeQueue::create_logger()
 
   pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
   pcb.add_u64(l_pq_executing_ops, "pq_executing_ops", "Purge queue ops in flight");
+  pcb.add_u64(l_pq_executing_ops_high_water, "pq_executing_ops_high_water", "Maximum number of executing file purge ops");
   pcb.add_u64(l_pq_executing, "pq_executing", "Purge queue tasks in flight");
+  pcb.add_u64(l_pq_executing_high_water, "pq_executing_high_water", "Maximum number of executing file purges");
   pcb.add_u64(l_pq_item_in_journal, "pq_item_in_journal", "Purge item left in journal");
 
   logger.reset(pcb.create_perf_counters());
@@ -475,9 +477,13 @@ void PurgeQueue::_execute_item(
 
   in_flight[expire_to] = item;
   logger->set(l_pq_executing, in_flight.size());
+  files_high_water = std::max(files_high_water, in_flight.size());
+  logger->set(l_pq_executing_high_water, files_high_water);
   auto ops = _calculate_ops(item);
   ops_in_flight += ops;
   logger->set(l_pq_executing_ops, ops_in_flight);
+  ops_high_water = std::max(ops_high_water, ops_in_flight);
+  logger->set(l_pq_executing_ops_high_water, ops_high_water);
 
   SnapContext nullsnapc;
 
@@ -545,8 +551,12 @@ void PurgeQueue::_execute_item(
             "dropping it" << dendl;
     ops_in_flight -= ops;
     logger->set(l_pq_executing_ops, ops_in_flight);
+    ops_high_water = std::max(ops_high_water, ops_in_flight);
+    logger->set(l_pq_executing_ops_high_water, ops_high_water);
     in_flight.erase(expire_to);
     logger->set(l_pq_executing, in_flight.size());
+    files_high_water = std::max(files_high_water, in_flight.size());
+    logger->set(l_pq_executing_high_water, files_high_water);
     return;
   }
   ceph_assert(gather.has_subs());
@@ -610,11 +620,15 @@ void PurgeQueue::_execute_item_complete(
 
   ops_in_flight -= _calculate_ops(iter->second);
   logger->set(l_pq_executing_ops, ops_in_flight);
+  ops_high_water = std::max(ops_high_water, ops_in_flight);
+  logger->set(l_pq_executing_ops_high_water, ops_high_water);
 
   dout(10) << "completed item for ino " << iter->second.ino << dendl;
 
   in_flight.erase(iter);
   logger->set(l_pq_executing, in_flight.size());
+  files_high_water = std::max(files_high_water, in_flight.size());
+  logger->set(l_pq_executing_high_water, files_high_water);
   dout(10) << "in_flight.size() now " << in_flight.size() << dendl;
 
   uint64_t write_pos = journaler.get_write_pos(); 
index c1c6c10202e7d20ef6bf2b5fe634ec190490f378..4ccc2ac10421b34d1b442b57df888b8954341d8c 100644 (file)
@@ -85,7 +85,9 @@ enum {
 
   // How many items have been finished by PurgeQueue
   l_pq_executing_ops,
+  l_pq_executing_ops_high_water,
   l_pq_executing,
+  l_pq_executing_high_water,
   l_pq_executed,
   l_pq_item_in_journal,
   l_pq_last
@@ -215,5 +217,8 @@ private:
   std::vector<Context*> waiting_for_recovery;
 
   size_t purge_item_journal_size;
+
+  uint64_t ops_high_water = 0;
+  uint64_t files_high_water = 0;
 };
 #endif