From 06c94de584e6cd7d347bcdfb79d9fef4fed0d277 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 11 Sep 2017 15:21:52 -0700 Subject: [PATCH] mds: support limiting cache by memory This introduces two config parameters: mds_cache_memory_limit: Sets the soft maximum of the cache to the given byte count. (Like mds_cache_size, this doesn't actually limit the maximum size of the cache. It just dictates the steady-state size.) mds_cache_reservation: This replaces mds_health_cache_threshold everywhere except the Beacon heartbeat sent to the mons. The idea here is to specify a reservation of memory (5% by default) for operations and the MDS tries to always maintain that reservation. So, the MDS will recall caps from clients when it begins dipping into its reservation of memory. mds_cache_size still limits the cache by Inode count but is now by-default 0 (i.e. unlimited). The new preferred way of specifying cache limits is by memory size. The default is 1GB. Fixes: http://tracker.ceph.com/issues/20594 Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1464976 Signed-off-by: Patrick Donnelly --- PendingReleaseNotes | 9 ++ doc/cephfs/health-messages.rst | 28 ++-- doc/cephfs/mds-config-ref.rst | 23 +++- .../basic_functional/tasks/client-limits.yaml | 1 + qa/tasks/cephfs/test_client_limits.py | 8 +- src/common/legacy_config_opts.h | 3 - src/common/options.cc | 22 ++- src/mds/Beacon.cc | 8 +- src/mds/MDCache.cc | 126 ++++++++++-------- src/mds/MDCache.h | 37 ++++- src/mds/MDLog.cc | 2 +- src/mds/Migrator.cc | 4 +- src/mds/Server.cc | 19 +-- src/mds/Server.h | 2 +- 14 files changed, 188 insertions(+), 104 deletions(-) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 51b5b3b7f989b..c730f7ac44b02 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -1,2 +1,11 @@ >= 12.2.0 --------- + +- *CephFS*: + + * Limiting MDS cache via a memory limit is now supported using the new + mds_cache_memory_limit config option (1GB by default). 
A cache reservation + can also be specified using mds_cache_reservation as a percentage of the + limit (5% by default). Limits by inode count are still supported using + mds_cache_size. Setting mds_cache_size to 0 (the default) disables the + inode limit. diff --git a/doc/cephfs/health-messages.rst b/doc/cephfs/health-messages.rst index 5e9f796787b59..adaafb842e201 100644 --- a/doc/cephfs/health-messages.rst +++ b/doc/cephfs/health-messages.rst @@ -73,14 +73,14 @@ so at all. This message appears if a client has taken longer than Message: "Client *name* failing to respond to cache pressure" Code: MDS_HEALTH_CLIENT_RECALL, MDS_HEALTH_CLIENT_RECALL_MANY -Description: Clients maintain a metadata cache. Items (such as inodes) -in the client cache are also pinned in the MDS cache, so when the MDS -needs to shrink its cache (to stay within ``mds_cache_size``), it -sends messages to clients to shrink their caches too. If the client -is unresponsive or buggy, this can prevent the MDS from properly staying -within its ``mds_cache_size`` and it may eventually run out of memory -and crash. This message appears if a client has taken more than -``mds_recall_state_timeout`` (default 60s) to comply. +Description: Clients maintain a metadata cache. Items (such as inodes) in the +client cache are also pinned in the MDS cache, so when the MDS needs to shrink +its cache (to stay within ``mds_cache_size`` or ``mds_cache_memory_limit``), it +sends messages to clients to shrink their caches too. If the client is +unresponsive or buggy, this can prevent the MDS from properly staying within +its cache limits and it may eventually run out of memory and crash. This +message appears if a client has taken more than ``mds_recall_state_timeout`` +(default 60s) to comply. 
Message: "Client *name* failing to advance its oldest client/flush tid" Code: MDS_HEALTH_CLIENT_OLDEST_TID, MDS_HEALTH_CLIENT_OLDEST_TID_MANY @@ -121,9 +121,9 @@ This message appears if any client requests have taken longer than Message: "Too many inodes in cache" Code: MDS_HEALTH_CACHE_OVERSIZED -Description: The MDS is not succeeding in trimming its cache to comply -with the limit set by the administrator. If the MDS cache becomes too large, -the daemon may exhaust available memory and crash. -This message appears if the actual cache size (in inodes) is at least 50% -greater than ``mds_cache_size`` (default 100000). - +Description: The MDS is not succeeding in trimming its cache to comply with the +limit set by the administrator. If the MDS cache becomes too large, the daemon +may exhaust available memory and crash. By default, this message appears if +the actual cache size (in inodes or memory) is at least 50% greater than +``mds_cache_size`` (default 0, unlimited) or ``mds_cache_memory_limit`` (default +1GB). Modify ``mds_health_cache_threshold`` to set the warning ratio. diff --git a/doc/cephfs/mds-config-ref.rst b/doc/cephfs/mds-config-ref.rst index b3446d698dbba..4f7bea3ef8585 100644 --- a/doc/cephfs/mds-config-ref.rst +++ b/doc/cephfs/mds-config-ref.rst @@ -19,13 +19,30 @@ :Type: 64-bit Integer Unsigned :Default: ``1ULL << 40`` +``mds cache memory limit`` + +:Description: The memory limit the MDS should enforce for its cache. + Administrators should use this instead of ``mds cache size``. +:Type: 64-bit Integer Unsigned +:Default: ``1073741824`` + +``mds cache reservation`` + +:Description: The cache reservation (memory or inodes) for the MDS cache to maintain. + Once the MDS begins dipping into its reservation, it will recall + client state until its cache size shrinks to restore the + reservation. +:Type: Float +:Default: ``0.05`` ``mds cache size`` -:Description: The number of inodes to cache. +:Description: The number of inodes to cache. 
A value of 0 indicates an + unlimited number. It is recommended to use + ``mds_cache_memory_limit`` to limit the amount of memory the MDS + cache uses. :Type: 32-bit Integer -:Default: ``100000`` - +:Default: ``0`` ``mds cache mid`` diff --git a/qa/suites/fs/basic_functional/tasks/client-limits.yaml b/qa/suites/fs/basic_functional/tasks/client-limits.yaml index 754b1d33f412b..635d0b6d82565 100644 --- a/qa/suites/fs/basic_functional/tasks/client-limits.yaml +++ b/qa/suites/fs/basic_functional/tasks/client-limits.yaml @@ -9,6 +9,7 @@ overrides: - failing to respond to cache pressure - slow requests are blocked - failing to respond to capability release + - MDS cache is too large - \(MDS_CLIENT_OLDEST_TID\) - \(MDS_CACHE_OVERSIZED\) diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py index 3f8ffa8758f79..b06d2a1d233fa 100644 --- a/qa/tasks/cephfs/test_client_limits.py +++ b/qa/tasks/cephfs/test_client_limits.py @@ -81,12 +81,12 @@ class TestClientLimits(CephFSTestCase): pass # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message, - # which depend on the cache size and overall ratio + # which depend on the caps outstanding, cache size and overall ratio self.wait_until_equal( lambda: self.get_session(mount_a_client_id)['num_caps'], - int(cache_size * 0.8), - timeout=600, - reject_fn=lambda x: x < int(cache_size*.8)) + int(open_files * 0.2), + timeout=30, + reject_fn=lambda x: x < int(open_files*0.2)) @needs_trimming def test_client_pin_root(self): diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h index 1a846818ff0b4..00ffae7a6139f 100644 --- a/src/common/legacy_config_opts.h +++ b/src/common/legacy_config_opts.h @@ -438,8 +438,6 @@ OPTION(mds_data, OPT_STR) OPTION(mds_max_file_size, OPT_U64) // Used when creating new CephFS. 
Change with 'ceph mds set max_file_size ' afterwards // max xattr kv pairs size for each dir/file OPTION(mds_max_xattr_pairs_size, OPT_U32) -OPTION(mds_cache_size, OPT_INT) -OPTION(mds_cache_mid, OPT_FLOAT) OPTION(mds_max_file_recover, OPT_U32) OPTION(mds_dir_max_commit_size, OPT_INT) // MB OPTION(mds_dir_keys_per_op, OPT_INT) @@ -459,7 +457,6 @@ OPTION(mds_recall_state_timeout, OPT_FLOAT) // detect clients which aren't tr OPTION(mds_freeze_tree_timeout, OPT_FLOAT) // detecting freeze tree deadlock OPTION(mds_session_autoclose, OPT_FLOAT) // autoclose idle session OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many' -OPTION(mds_health_cache_threshold, OPT_FLOAT) // warn on cache size if it exceeds mds_cache_size by this factor OPTION(mds_reconnect_timeout, OPT_FLOAT) // seconds to wait for clients during mds restart // make it (mds_session_timeout - mds_beacon_grace) OPTION(mds_tick_interval, OPT_FLOAT) diff --git a/src/common/options.cc b/src/common/options.cc index e6c678b3330a3..8d625a854e976 100644 --- a/src/common/options.cc +++ b/src/common/options.cc @@ -5309,8 +5309,22 @@ std::vector