From 06c94de584e6cd7d347bcdfb79d9fef4fed0d277 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly <pdonnell@redhat.com>
Date: Mon, 11 Sep 2017 15:21:52 -0700
Subject: [PATCH] mds: support limiting cache by memory

This introduces two config parameters:

    mds_cache_memory_limit: Sets the soft maximum of the cache to the given
    byte count. (Like mds_cache_size, this doesn't actually limit the maximum
    size of the cache. It just dictates the steady-state size.)

    mds_cache_reservation: This replaces mds_health_cache_threshold everywhere
    except the Beacon heartbeat sent to the mons. The idea here is to specify a
    reservation of memory (5% by default) for operations and the MDS tries to
    always maintain that reservation. So, the MDS will recall caps from clients
    when it begins dipping into its reservation of memory.

mds_cache_size still limits the cache by inode count, but it now defaults to 0
(i.e. unlimited). The new preferred way of specifying cache limits is by memory
size; mds_cache_memory_limit defaults to 1GB.

Fixes: http://tracker.ceph.com/issues/20594
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1464976

Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
---
 PendingReleaseNotes                           |   9 ++
 doc/cephfs/health-messages.rst                |  28 ++--
 doc/cephfs/mds-config-ref.rst                 |  23 +++-
 .../basic_functional/tasks/client-limits.yaml |   1 +
 qa/tasks/cephfs/test_client_limits.py         |   8 +-
 src/common/legacy_config_opts.h               |   3 -
 src/common/options.cc                         |  22 ++-
 src/mds/Beacon.cc                             |   8 +-
 src/mds/MDCache.cc                            | 126 ++++++++++--------
 src/mds/MDCache.h                             |  37 ++++-
 src/mds/MDLog.cc                              |   2 +-
 src/mds/Migrator.cc                           |   4 +-
 src/mds/Server.cc                             |  19 +--
 src/mds/Server.h                              |   2 +-
 14 files changed, 188 insertions(+), 104 deletions(-)

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 51b5b3b7f989b..c730f7ac44b02 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -1,2 +1,11 @@
 >= 12.2.0
 ---------
+
+- *CephFS*:
+
+  * Limiting MDS cache via a memory limit is now supported using the new
+    mds_cache_memory_limit config option (1GB by default).  A cache reservation
+    can also be specified using mds_cache_reservation as a percentage of the
+    limit (5% by default). Limits by inode count are still supported using
+    mds_cache_size. Setting mds_cache_size to 0 (the default) disables the
+    inode limit.
diff --git a/doc/cephfs/health-messages.rst b/doc/cephfs/health-messages.rst
index 5e9f796787b59..adaafb842e201 100644
--- a/doc/cephfs/health-messages.rst
+++ b/doc/cephfs/health-messages.rst
@@ -73,14 +73,14 @@ so at all.  This message appears if a client has taken longer than
 
 Message: "Client *name* failing to respond to cache pressure"
 Code: MDS_HEALTH_CLIENT_RECALL, MDS_HEALTH_CLIENT_RECALL_MANY
-Description: Clients maintain a metadata cache.  Items (such as inodes)
-in the client cache are also pinned in the MDS cache, so when the MDS
-needs to shrink its cache (to stay within ``mds_cache_size``), it
-sends messages to clients to shrink their caches too.  If the client
-is unresponsive or buggy, this can prevent the MDS from properly staying
-within its ``mds_cache_size`` and it may eventually run out of memory
-and crash.  This message appears if a client has taken more than
-``mds_recall_state_timeout`` (default 60s) to comply.
+Description: Clients maintain a metadata cache.  Items (such as inodes) in the
+client cache are also pinned in the MDS cache, so when the MDS needs to shrink
+its cache (to stay within ``mds_cache_size`` or ``mds_cache_memory_limit``), it
+sends messages to clients to shrink their caches too.  If the client is
+unresponsive or buggy, this can prevent the MDS from properly staying within
+its cache limits and it may eventually run out of memory and crash.  This
+message appears if a client has taken more than ``mds_recall_state_timeout``
+(default 60s) to comply.
 
 Message: "Client *name* failing to advance its oldest client/flush tid"
 Code: MDS_HEALTH_CLIENT_OLDEST_TID, MDS_HEALTH_CLIENT_OLDEST_TID_MANY
@@ -121,9 +121,9 @@ This message appears if any client requests have taken longer than
 
 Message: "Too many inodes in cache"
 Code: MDS_HEALTH_CACHE_OVERSIZED
-Description: The MDS is not succeeding in trimming its cache to comply
-with the limit set by the administrator.  If the MDS cache becomes too large,
-the daemon may exhaust available memory and crash.
-This message appears if the actual cache size (in inodes) is at least 50%
-greater than ``mds_cache_size`` (default 100000).
-
+Description: The MDS is not succeeding in trimming its cache to comply with the
+limit set by the administrator.  If the MDS cache becomes too large, the daemon
+may exhaust available memory and crash.  By default, this message appears if
+the actual cache size (in inodes or memory) is at least 50% greater than
+``mds_cache_size`` (default 0, i.e. unlimited) or ``mds_cache_memory_limit``
+(default 1GB). Modify ``mds_health_cache_threshold`` to set the warning ratio.
diff --git a/doc/cephfs/mds-config-ref.rst b/doc/cephfs/mds-config-ref.rst
index b3446d698dbba..4f7bea3ef8585 100644
--- a/doc/cephfs/mds-config-ref.rst
+++ b/doc/cephfs/mds-config-ref.rst
@@ -19,13 +19,30 @@
 :Type:  64-bit Integer Unsigned
 :Default:  ``1ULL << 40``
 
+``mds cache memory limit``
+
+:Description: The memory limit the MDS should enforce for its cache.
+              Administrators should use this instead of ``mds cache size``.
+:Type:  64-bit Integer Unsigned
+:Default: ``1073741824``
+
+``mds cache reservation``
+
+:Description: The cache reservation (memory or inodes) for the MDS cache to
+              maintain, as a fraction of the cache limit. Once the MDS begins
+              dipping into its reservation, it will recall client state until
+              its cache size shrinks to restore the reservation.
+:Type:  Float
+:Default: ``0.05``
 
 ``mds cache size``
 
-:Description: The number of inodes to cache.
+:Description: The number of inodes to cache. A value of 0 indicates an
+              unlimited number. It is recommended to use
+              ``mds_cache_memory_limit`` to limit the amount of memory the MDS
+              cache uses.
 :Type:  32-bit Integer
-:Default: ``100000``
-
+:Default: ``0``
 
 ``mds cache mid``
 
diff --git a/qa/suites/fs/basic_functional/tasks/client-limits.yaml b/qa/suites/fs/basic_functional/tasks/client-limits.yaml
index 754b1d33f412b..635d0b6d82565 100644
--- a/qa/suites/fs/basic_functional/tasks/client-limits.yaml
+++ b/qa/suites/fs/basic_functional/tasks/client-limits.yaml
@@ -9,6 +9,7 @@ overrides:
       - failing to respond to cache pressure
       - slow requests are blocked
       - failing to respond to capability release
+      - MDS cache is too large
       - \(MDS_CLIENT_OLDEST_TID\)
       - \(MDS_CACHE_OVERSIZED\)
 
diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py
index 3f8ffa8758f79..b06d2a1d233fa 100644
--- a/qa/tasks/cephfs/test_client_limits.py
+++ b/qa/tasks/cephfs/test_client_limits.py
@@ -81,12 +81,12 @@ class TestClientLimits(CephFSTestCase):
             pass
 
         # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
-        # which depend on the cache size and overall ratio
+        # which depend on the caps outstanding, cache size and overall ratio
         self.wait_until_equal(
             lambda: self.get_session(mount_a_client_id)['num_caps'],
-            int(cache_size * 0.8),
-            timeout=600,
-            reject_fn=lambda x: x < int(cache_size*.8))
+            int(open_files * 0.2),
+            timeout=30,
+            reject_fn=lambda x: x < int(open_files*0.2))
 
     @needs_trimming
     def test_client_pin_root(self):
diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h
index 1a846818ff0b4..00ffae7a6139f 100644
--- a/src/common/legacy_config_opts.h
+++ b/src/common/legacy_config_opts.h
@@ -438,8 +438,6 @@ OPTION(mds_data, OPT_STR)
 OPTION(mds_max_file_size, OPT_U64) // Used when creating new CephFS. Change with 'ceph mds set max_file_size <size>' afterwards
 // max xattr kv pairs size for each dir/file
 OPTION(mds_max_xattr_pairs_size, OPT_U32)
-OPTION(mds_cache_size, OPT_INT)
-OPTION(mds_cache_mid, OPT_FLOAT)
 OPTION(mds_max_file_recover, OPT_U32)
 OPTION(mds_dir_max_commit_size, OPT_INT) // MB
 OPTION(mds_dir_keys_per_op, OPT_INT)
@@ -459,7 +457,6 @@ OPTION(mds_recall_state_timeout, OPT_FLOAT)    // detect clients which aren't tr
 OPTION(mds_freeze_tree_timeout, OPT_FLOAT)    // detecting freeze tree deadlock
 OPTION(mds_session_autoclose, OPT_FLOAT) // autoclose idle session
 OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many'
-OPTION(mds_health_cache_threshold, OPT_FLOAT) // warn on cache size if it exceeds mds_cache_size by this factor
 OPTION(mds_reconnect_timeout, OPT_FLOAT)  // seconds to wait for clients during mds restart
 	      //  make it (mds_session_timeout - mds_beacon_grace)
 OPTION(mds_tick_interval, OPT_FLOAT)
diff --git a/src/common/options.cc b/src/common/options.cc
index e6c678b3330a3..8d625a854e976 100644
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -5309,8 +5309,22 @@ std::vector<Option> get_mds_options() {
     .set_description(""),
 
     Option("mds_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
-    .set_default(100000)
-    .set_description(""),
+    .set_default(0)
+    .set_description("maximum number of inodes in MDS cache (<=0 is unlimited)")
+    .set_long_description("This tunable is no longer recommended. Use mds_cache_memory_limit."),
+
+    Option("mds_cache_memory_limit", Option::TYPE_UINT, Option::LEVEL_BASIC)
+    .set_default(1*(1LL<<30))
+    .set_description("target maximum memory usage of MDS cache")
+    .set_long_description("This sets a target maximum memory usage of the MDS cache and is the primary tunable to limit the MDS memory usage. The MDS will try to stay under a reservation of this limit (by default 95%; 1 - mds_cache_reservation) by trimming unused metadata in its cache and recalling cached items in the client caches. It is possible for the MDS to exceed this limit due to slow recall from clients. The mds_health_cache_threshold (150%) sets a cache full threshold for when the MDS signals a cluster health warning."),
+
+    Option("mds_cache_reservation", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.05)
+    .set_description("fraction of the cache limit (memory or inodes) to hold in reserve"),
+
+    Option("mds_health_cache_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.5)
+    .set_description("threshold for cache size to generate health warning"),
 
     Option("mds_cache_mid", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(.7)
@@ -5384,10 +5398,6 @@ std::vector<Option> get_mds_options() {
     .set_default(10)
     .set_description(""),
 
-    Option("mds_health_cache_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
-    .set_default(1.5)
-    .set_description(""),
-
     Option("mds_reconnect_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(45)
     .set_description(""),
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
index aae46eab4d775..e6bf3930e9132 100644
--- a/src/mds/Beacon.cc
+++ b/src/mds/Beacon.cc
@@ -15,6 +15,7 @@
 
 #include "common/dout.h"
 #include "common/HeartbeatMap.h"
+
 #include "include/stringify.h"
 #include "include/util.h"
 
@@ -475,11 +476,10 @@ void Beacon::notify_health(MDSRank const *mds)
   }
 
   // Report if we have significantly exceeded our cache size limit
-  if (mds->mdcache->get_cache_size() >
-        g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
+  if (mds->mdcache->cache_overfull()) {
     std::ostringstream oss;
-    oss << "Too many inodes in cache (" << mds->mdcache->get_cache_size()
-        << "/" << g_conf->mds_cache_size << "), "
+    oss << "MDS cache is too large (" << bytes2str(mds->mdcache->cache_size())
+        << "/" << bytes2str(mds->mdcache->cache_limit_memory()) << "); "
         << mds->mdcache->num_inodes_with_caps << " inodes in use by clients, "
         << mds->mdcache->get_num_strays() << " stray files";
 
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 343167c99e176..d61c3dc766d9c 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -39,14 +39,16 @@
 
 #include "include/ceph_fs.h"
 #include "include/filepath.h"
+#include "include/util.h"
 
 #include "msg/Message.h"
 #include "msg/Messenger.h"
 
+#include "common/MemoryModel.h"
 #include "common/errno.h"
-#include "common/safe_io.h"
 #include "common/perf_counters.h"
-#include "common/MemoryModel.h"
+#include "common/safe_io.h"
+
 #include "osdc/Journaler.h"
 #include "osdc/Filer.h"
 
@@ -202,7 +204,7 @@ MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
   cap_imports_num_opening = 0;
 
   opening_root = open = false;
-  lru.lru_set_midpoint(g_conf->mds_cache_mid);
+  lru.lru_set_midpoint(cache_mid());
 
   bottom_lru.lru_set_midpoint(0);
 
@@ -222,7 +224,7 @@ MDCache::~MDCache()
 
 void MDCache::log_stat()
 {
-  mds->logger->set(l_mds_inode_max, g_conf->mds_cache_size);
+  mds->logger->set(l_mds_inode_max, cache_limit_inodes() == 0 ? INT_MAX : cache_limit_inodes());
   mds->logger->set(l_mds_inodes, lru.lru_get_size());
   mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
   mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
@@ -277,8 +279,7 @@ void MDCache::add_inode(CInode *in)
       base_inodes.insert(in);
   }
 
-  if (CInode::count() >
-        g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
+  if (cache_toofull()) {
     exceeded_size_limit = true;
   }
 }
@@ -6412,34 +6413,19 @@ void MDCache::start_recovered_truncates()
 // ================================================================================
 // cache trimming
 
-
-/*
- * note: only called while MDS is active or stopping... NOT during recovery.
- * however, we may expire a replica whose authority is recovering.
- * 
- */
-bool MDCache::trim(int max, int count)
-{
-  // trim LRU
-  if (count > 0) {
-    max = lru.lru_get_size() - count;
-    if (max <= 0)
-      max = 1;
-  } else if (max < 0) {
-    max = g_conf->mds_cache_size;
-    if (max <= 0)
-      return false;
-  }
-  dout(7) << "trim max=" << max << "  cur=" << lru.lru_get_size()
-	  << "/" << bottom_lru.lru_get_size() << dendl;
-
-  // process delayed eval_stray()
-  stray_manager.advance_delayed();
-
-  map<mds_rank_t, MCacheExpire*> expiremap;
+void MDCache::trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*> &expiremap)
+{
   bool is_standby_replay = mds->is_standby_replay();
-  int unexpirable = 0;
-  list<CDentry*> unexpirables;
+  std::vector<CDentry *> unexpirables;
+  uint64_t trimmed = 0;
+
+  dout(7) << "trim_lru trimming " << count
+          << " items from LRU"
+          << " size=" << lru.lru_get_size()
+          << " mid=" << lru.lru_get_top()
+          << " pintail=" << lru.lru_get_pintail()
+          << " pinned=" << lru.lru_get_num_pinned()
+          << dendl;
 
   for (;;) {
     CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
@@ -6447,34 +6433,65 @@ bool MDCache::trim(int max, int count)
       break;
     if (trim_dentry(dn, expiremap)) {
       unexpirables.push_back(dn);
-      ++unexpirable;
+    } else {
+      trimmed++;
     }
   }
 
-  for(auto dn : unexpirables)
+  for (auto &dn : unexpirables) {
     bottom_lru.lru_insert_mid(dn);
+  }
   unexpirables.clear();
 
-  // trim dentries from the LRU: only enough to satisfy `max`,
-  while (lru.lru_get_size() + unexpirable > (unsigned)max) {
+  // expire LRU dentries while the cache is too full or requested count remains
+  while (cache_toofull() || count > 0) {
     CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
     if (!dn) {
       break;
     }
     if ((is_standby_replay && dn->get_linkage()->inode &&
-        dn->get_linkage()->inode->item_open_file.is_on_list()) ||
-	trim_dentry(dn, expiremap)) {
+        dn->get_linkage()->inode->item_open_file.is_on_list())) {
       unexpirables.push_back(dn);
-      ++unexpirable;
+    } else if (trim_dentry(dn, expiremap)) {
+      unexpirables.push_back(dn);
+    } else {
+      trimmed++;
     }
+    if (count > 0) count--;  // unsigned: wrapping past 0 would drain the whole LRU
   }
-  for(auto dn : unexpirables)
+
+  for (auto &dn : unexpirables) {
     lru.lru_insert_mid(dn);
+  }
   unexpirables.clear();
 
+  dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
+}
+
+/*
+ * note: only called while MDS is active or stopping... NOT during recovery.
+ * however, we may expire a replica whose authority is recovering.
+ *
+ * @param count is number of dentries to try to expire
+ */
+bool MDCache::trim(uint64_t count)
+{
+  uint64_t used = cache_size();
+  uint64_t limit = cache_limit_memory();
+  map<mds_rank_t, MCacheExpire*> expiremap;
+
+  dout(7) << "trim bytes_used=" << bytes2str(used)
+          << " limit=" << bytes2str(limit)
+          << " reservation=" << cache_reservation()
+          << "% count=" << count << dendl;
+
+  // process delayed eval_stray()
+  stray_manager.advance_delayed();
+
+  trim_lru(count, expiremap);
+
   // trim non-auth, non-bound subtrees
-  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
-       p != subtrees.end();) {
+  for (auto p = subtrees.begin(); p != subtrees.end();) {
     CDir *dir = p->first;
     ++p;
     CInode *diri = dir->get_inode();
@@ -6507,7 +6524,7 @@ bool MDCache::trim(int max, int count)
   }
 
   // trim root?
-  if (max == 0 && root) {
+  if (mds->is_stopping() && root) {
     list<CDir*> ls;
     root->get_dirfrags(ls);
     for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
@@ -6550,7 +6567,7 @@ bool MDCache::trim(int max, int count)
   }
 
   // Other rank's base inodes (when I'm stopping)
-  if (max == 0) {
+  if (mds->is_stopping()) {
     for (set<CInode*>::iterator p = base_inodes.begin();
          p != base_inodes.end(); ++p) {
       if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
@@ -7400,9 +7417,9 @@ void MDCache::check_memory_usage()
 
   // check client caps
   assert(CInode::count() == inode_map.size() + snap_inode_map.size());
-  float caps_per_inode = 0.0;
+  double caps_per_inode = 0.0;
   if (CInode::count())
-    caps_per_inode = (float)Capability::count() / (float)CInode::count();
+    caps_per_inode = (double)Capability::count() / (double)CInode::count();
 
   dout(2) << "check_memory_usage"
 	   << " total " << last.get_total()
@@ -7418,20 +7435,15 @@ void MDCache::check_memory_usage()
   mds->mlogger->set(l_mdm_rss, last.get_rss());
   mds->mlogger->set(l_mdm_heap, last.get_heap());
 
-  if (num_inodes_with_caps > g_conf->mds_cache_size) {
-    float ratio = (float)g_conf->mds_cache_size * .9 / (float)num_inodes_with_caps;
-    if (ratio < 1.0) {
-      last_recall_state = ceph_clock_now();
-      mds->server->recall_client_state(ratio);
-    }
+  if (cache_toofull()) {
+    last_recall_state = ceph_clock_now();
+    mds->server->recall_client_state();
   }
 
   // If the cache size had exceeded its limit, but we're back in bounds
   // now, free any unused pool memory so that our memory usage isn't
   // permanently bloated.
-  if (exceeded_size_limit
-      && CInode::count() <=
-        g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
+  if (exceeded_size_limit && !cache_toofull()) {
     // Only do this once we are back in bounds: otherwise the releases would
     // slow down whatever process caused us to exceed bounds to begin with
     if (ceph_using_tcmalloc()) {
@@ -7523,7 +7535,7 @@ bool MDCache::shutdown_pass()
   }
 
   // trim cache
-  trim(0);
+  trim(UINT64_MAX);
   dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
 
   // SUBTREES
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 4af04efbdbef3..a2ab48751a0b9 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -147,6 +147,38 @@ class MDCache {
   bool exceeded_size_limit;
 
 public:
+  static uint64_t cache_limit_inodes(void) {
+    return g_conf->get_val<int64_t>("mds_cache_size");
+  }
+  static uint64_t cache_limit_memory(void) {
+    return g_conf->get_val<uint64_t>("mds_cache_memory_limit");
+  }
+  static double cache_reservation(void) {
+    return g_conf->get_val<double>("mds_cache_reservation");
+  }
+  static double cache_mid(void) {
+    return g_conf->get_val<double>("mds_cache_mid");
+  }
+  static double cache_health_threshold(void) {
+    return g_conf->get_val<double>("mds_health_cache_threshold");
+  }
+  double cache_toofull_ratio(void) const {
+    uint64_t inode_limit = cache_limit_inodes();
+    double inode_reserve = inode_limit*(1.0-cache_reservation());
+    double memory_reserve = cache_limit_memory()*(1.0-cache_reservation());
+    return fmax(0.0, fmax((cache_size()-memory_reserve)/memory_reserve, inode_limit == 0 ? 0.0 : (CInode::count()-inode_reserve)/inode_reserve));
+  }
+  bool cache_toofull(void) const {
+    return cache_toofull_ratio() > 0.0;
+  }
+  uint64_t cache_size(void) const {
+    return mempool::get_pool(mempool::mds_co::id).allocated_bytes();
+  }
+  bool cache_overfull(void) const {
+    uint64_t inode_limit = cache_limit_inodes();
+    return (inode_limit > 0 && CInode::count() > inode_limit*cache_health_threshold()) || (cache_size() > cache_limit_memory()*cache_health_threshold());
+  }
+
   void advance_stray() {
     stray_index = (stray_index+1)%NUM_STRAY;
   }
@@ -675,7 +707,9 @@ public:
   size_t get_cache_size() { return lru.lru_get_size(); }
 
   // trimming
-  bool trim(int max=-1, int count=-1);   // trim cache
+  bool trim(uint64_t count=0);
+private:
+  void trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*>& expiremap);
   bool trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap);
   void trim_dirfrag(CDir *dir, CDir *con,
 		    map<mds_rank_t, MCacheExpire*>& expiremap);
@@ -683,6 +717,7 @@ public:
 		  map<mds_rank_t,class MCacheExpire*>& expiremap);
   void send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap);
   void trim_non_auth();      // trim out trimmable non-auth items
+public:
   bool trim_non_auth_subtree(CDir *directory);
   void standby_trim_segment(LogSegment *ls);
   void try_trim_non_auth_subtree(CDir *dir);
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
index cb1f9b558b24c..806974a1d0b7d 100644
--- a/src/mds/MDLog.cc
+++ b/src/mds/MDLog.cc
@@ -1446,7 +1446,7 @@ void MDLog::standby_trim_segments()
 
   if (removed_segment) {
     dout(20) << " calling mdcache->trim!" << dendl;
-    mds->mdcache->trim(-1);
+    mds->mdcache->trim();
   } else {
     dout(20) << " removed no segments!" << dendl;
   }
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 59a82dcd42be2..fbf75c913634d 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -2007,7 +2007,7 @@ void Migrator::export_finish(CDir *dir)
   cache->show_subtrees();
   audit();
 
-  cache->trim(-1, num_dentries); // try trimming exported dentries
+  cache->trim(num_dentries); // try trimming exported dentries
 
   // send pending import_maps?
   mds->mdcache->maybe_send_pending_resolves();
@@ -2650,7 +2650,7 @@ void Migrator::import_reverse(CDir *dir)
   // log our failure
   mds->mdlog->start_submit_entry(new EImportFinish(dir, false));	// log failure
 
-  cache->trim(-1, num_dentries); // try trimming dentries
+  cache->trim(num_dentries); // try trimming dentries
 
   // notify bystanders; wait in aborting state
   import_notify_abort(dir, bounds);
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 7141d0ff0e614..a5ea0ca09bb73 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -1080,10 +1080,16 @@ void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
  * to trim some caps, and consequently unpin some inodes in the MDCache so
  * that it can trim too.
  */
-void Server::recall_client_state(float ratio)
+void Server::recall_client_state(void)
 {
-  int max_caps_per_client = (int)(g_conf->mds_cache_size * .8);
-  int min_caps_per_client = 100;
+  /* try to recall at least 80% of all caps */
+  uint64_t max_caps_per_client = (Capability::count() * .8);
+  uint64_t min_caps_per_client = 100;
+  /* unless this ratio is smaller: */
+  /* ratio: determine the amount of caps to recall from each client. Use
+   * percentage full over the cache reservation. Cap the ratio at 80% of client
+   * caps. */
+  double ratio = 1.0-fmin(0.80, mdcache->cache_toofull_ratio());
 
   dout(10) << "recall_client_state " << ratio
 	   << ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
@@ -1091,10 +1097,7 @@ void Server::recall_client_state(float ratio)
 
   set<Session*> sessions;
   mds->sessionmap.get_client_session_set(sessions);
-  for (set<Session*>::const_iterator p = sessions.begin();
-       p != sessions.end();
-       ++p) {
-    Session *session = *p;
+  for (auto &session : sessions) {
     if (!session->is_open() ||
 	!session->info.inst.name.is_client())
       continue;
@@ -1105,7 +1108,7 @@ void Server::recall_client_state(float ratio)
 	     << dendl;
 
     if (session->caps.size() > min_caps_per_client) {	
-      int newlim = MIN((int)(session->caps.size() * ratio), max_caps_per_client);
+      uint64_t newlim = MIN((session->caps.size() * ratio), max_caps_per_client);
       if (session->caps.size() > newlim) {
           MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
           m->head.max_caps = newlim;
diff --git a/src/mds/Server.h b/src/mds/Server.h
index 752ba10b326ee..2543953bab5fe 100644
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -133,7 +133,7 @@ public:
   void reconnect_tick();
   void recover_filelocks(CInode *in, bufferlist locks, int64_t client);
 
-  void recall_client_state(float ratio);
+  void recall_client_state(void);
   void force_clients_readonly();
 
   // -- requests --
-- 
2.39.5