>= 12.2.0
---------
+
+- *CephFS*:
+
+ * Limiting MDS cache via a memory limit is now supported using the new
+ mds_cache_memory_limit config option (1GB by default). A cache reservation
+ can also be specified using mds_cache_reservation as a percentage of the
+ limit (5% by default). Limits by inode count are still supported using
+ mds_cache_size. Setting mds_cache_size to 0 (the default) disables the
+ inode limit.
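+
+    For example, with the defaults (1GB limit and 5% reservation) the MDS
+    begins trimming its cache and recalling client state once cache memory
+    use passes roughly 95% of the limit.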
Message: "Client *name* failing to respond to cache pressure"
Code: MDS_HEALTH_CLIENT_RECALL, MDS_HEALTH_CLIENT_RECALL_MANY
-Description: Clients maintain a metadata cache. Items (such as inodes)
-in the client cache are also pinned in the MDS cache, so when the MDS
-needs to shrink its cache (to stay within ``mds_cache_size``), it
-sends messages to clients to shrink their caches too. If the client
-is unresponsive or buggy, this can prevent the MDS from properly staying
-within its ``mds_cache_size`` and it may eventually run out of memory
-and crash. This message appears if a client has taken more than
-``mds_recall_state_timeout`` (default 60s) to comply.
+Description: Clients maintain a metadata cache. Items (such as inodes) in the
+client cache are also pinned in the MDS cache, so when the MDS needs to shrink
+its cache (to stay within ``mds_cache_size`` or ``mds_cache_memory_limit``), it
+sends messages to clients to shrink their caches too. If the client is
+unresponsive or buggy, this can prevent the MDS from properly staying within
+its cache limits and it may eventually run out of memory and crash. This
+message appears if a client has taken more than ``mds_recall_state_timeout``
+(default 60s) to comply.
Message: "Client *name* failing to advance its oldest client/flush tid"
Code: MDS_HEALTH_CLIENT_OLDEST_TID, MDS_HEALTH_CLIENT_OLDEST_TID_MANY
Message: "Too many inodes in cache"
Code: MDS_HEALTH_CACHE_OVERSIZED
-Description: The MDS is not succeeding in trimming its cache to comply
-with the limit set by the administrator. If the MDS cache becomes too large,
-the daemon may exhaust available memory and crash.
-This message appears if the actual cache size (in inodes) is at least 50%
-greater than ``mds_cache_size`` (default 100000).
-
+Description: The MDS is not succeeding in trimming its cache to comply with the
+limit set by the administrator. If the MDS cache becomes too large, the daemon
+may exhaust available memory and crash. By default, this message appears if
+the actual cache size (in inodes or memory) is at least 50% greater than
+``mds_cache_size`` (unlimited by default) or ``mds_cache_memory_limit`` (default
+1GB). Modify ``mds_health_cache_threshold`` to set the warning ratio.
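+For example, with the default 1GB ``mds_cache_memory_limit`` the warning is
+raised once the cache grows past 1.5GB (1GB x 1.5).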
:Type: 64-bit Integer Unsigned
:Default: ``1ULL << 40``
+``mds cache memory limit``
+
+:Description: The memory limit the MDS should enforce for its cache.
+ Administrators should use this instead of ``mds cache size``.
+:Type: 64-bit Integer Unsigned
+:Default: ``1073741824``
+
+``mds cache reservation``
+
+:Description: The cache reservation (memory or inodes) for the MDS cache to maintain.
+ Once the MDS begins dipping into its reservation, it will recall
+ client state until its cache size shrinks to restore the
+ reservation.
+:Type: Float
+:Default: ``0.05``
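+
+For example, to target roughly a 4GB MDS cache one might set the following in
+``ceph.conf`` (an illustrative value; the limit is given in bytes)::
+
+    [mds]
+        mds cache memory limit = 4294967296
+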
``mds cache size``
-:Description: The number of inodes to cache.
+:Description: The number of inodes to cache. A value of 0 indicates an
+ unlimited number. It is recommended to use
+ ``mds_cache_memory_limit`` to limit the amount of memory the MDS
+ cache uses.
:Type: 32-bit Integer
-:Default: ``100000``
+:Default: ``0``

``mds cache mid``
- failing to respond to cache pressure
- slow requests are blocked
- failing to respond to capability release
+ - MDS cache is too large
- \(MDS_CLIENT_OLDEST_TID\)
- \(MDS_CACHE_OVERSIZED\)
pass
# The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
- # which depend on the cache size and overall ratio
+ # which depend on the caps outstanding, cache size and overall ratio
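+        # (assuming the test drives the cache well past its reservation, the
+        # recall ratio bottoms out at 1 - 0.80 = 0.20, so the client should be
+        # left with about 20% of its caps)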
self.wait_until_equal(
lambda: self.get_session(mount_a_client_id)['num_caps'],
- int(cache_size * 0.8),
- timeout=600,
- reject_fn=lambda x: x < int(cache_size*.8))
+ int(open_files * 0.2),
+ timeout=30,
+ reject_fn=lambda x: x < int(open_files*0.2))
@needs_trimming
def test_client_pin_root(self):
OPTION(mds_max_file_size, OPT_U64) // Used when creating new CephFS. Change with 'ceph mds set max_file_size <size>' afterwards
// max xattr kv pairs size for each dir/file
OPTION(mds_max_xattr_pairs_size, OPT_U32)
-OPTION(mds_cache_size, OPT_INT)
-OPTION(mds_cache_mid, OPT_FLOAT)
OPTION(mds_max_file_recover, OPT_U32)
OPTION(mds_dir_max_commit_size, OPT_INT) // MB
OPTION(mds_dir_keys_per_op, OPT_INT)
OPTION(mds_freeze_tree_timeout, OPT_FLOAT) // detecting freeze tree deadlock
OPTION(mds_session_autoclose, OPT_FLOAT) // autoclose idle session
OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many'
-OPTION(mds_health_cache_threshold, OPT_FLOAT) // warn on cache size if it exceeds mds_cache_size by this factor
OPTION(mds_reconnect_timeout, OPT_FLOAT) // seconds to wait for clients during mds restart
// make it (mds_session_timeout - mds_beacon_grace)
OPTION(mds_tick_interval, OPT_FLOAT)
.set_description(""),
Option("mds_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
- .set_default(100000)
- .set_description(""),
+ .set_default(0)
+ .set_description("maximum number of inodes in MDS cache (<=0 is unlimited)")
+ .set_long_description("This tunable is no longer recommended. Use mds_cache_memory_limit."),
+
+ Option("mds_cache_memory_limit", Option::TYPE_UINT, Option::LEVEL_BASIC)
+ .set_default(1*(1LL<<30))
+ .set_description("target maximum memory usage of MDS cache")
+    .set_long_description("This sets a target maximum memory usage of the MDS cache and is the primary tunable to limit the MDS memory usage. The MDS will try to stay under a reservation of this limit (by default 95%; 1 - mds_cache_reservation) by trimming unused metadata in its cache and recalling cached items in the client caches. It is possible for the MDS to exceed this limit due to slow recall from clients. The mds_health_cache_threshold (150% by default) sets the cache full threshold above which the MDS signals a cluster health warning."),
+
+ Option("mds_cache_reservation", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(.05)
+    .set_description("fraction of the cache limit (memory or inodes) to keep in reserve"),
+
+ Option("mds_health_cache_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(1.5)
+ .set_description("threshold for cache size to generate health warning"),
Option("mds_cache_mid", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(.7)
.set_default(10)
.set_description(""),
- Option("mds_health_cache_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
- .set_default(1.5)
- .set_description(""),
-
Option("mds_reconnect_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(45)
.set_description(""),
#include "common/dout.h"
#include "common/HeartbeatMap.h"
+
#include "include/stringify.h"
#include "include/util.h"
}
// Report if we have significantly exceeded our cache size limit
- if (mds->mdcache->get_cache_size() >
- g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
+ if (mds->mdcache->cache_overfull()) {
std::ostringstream oss;
- oss << "Too many inodes in cache (" << mds->mdcache->get_cache_size()
- << "/" << g_conf->mds_cache_size << "), "
+ oss << "MDS cache is too large (" << bytes2str(mds->mdcache->cache_size())
+ << "/" << bytes2str(mds->mdcache->cache_limit_memory()) << "); "
<< mds->mdcache->num_inodes_with_caps << " inodes in use by clients, "
<< mds->mdcache->get_num_strays() << " stray files";
#include "include/ceph_fs.h"
#include "include/filepath.h"
+#include "include/util.h"
#include "msg/Message.h"
#include "msg/Messenger.h"
+#include "common/MemoryModel.h"
#include "common/errno.h"
-#include "common/safe_io.h"
#include "common/perf_counters.h"
-#include "common/MemoryModel.h"
+#include "common/safe_io.h"
+
#include "osdc/Journaler.h"
#include "osdc/Filer.h"
cap_imports_num_opening = 0;
opening_root = open = false;
- lru.lru_set_midpoint(g_conf->mds_cache_mid);
+ lru.lru_set_midpoint(cache_mid());
bottom_lru.lru_set_midpoint(0);
void MDCache::log_stat()
{
- mds->logger->set(l_mds_inode_max, g_conf->mds_cache_size);
+ mds->logger->set(l_mds_inode_max, cache_limit_inodes() == 0 ? INT_MAX : cache_limit_inodes());
mds->logger->set(l_mds_inodes, lru.lru_get_size());
mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
base_inodes.insert(in);
}
- if (CInode::count() >
- g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
+ if (cache_toofull()) {
exceeded_size_limit = true;
}
}
// ================================================================================
// cache trimming
-
-/*
- * note: only called while MDS is active or stopping... NOT during recovery.
- * however, we may expire a replica whose authority is recovering.
- *
- */
-bool MDCache::trim(int max, int count)
-{
- // trim LRU
- if (count > 0) {
- max = lru.lru_get_size() - count;
- if (max <= 0)
- max = 1;
- } else if (max < 0) {
- max = g_conf->mds_cache_size;
- if (max <= 0)
- return false;
- }
- dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size()
- << "/" << bottom_lru.lru_get_size() << dendl;
-
- // process delayed eval_stray()
- stray_manager.advance_delayed();
-
- map<mds_rank_t, MCacheExpire*> expiremap;
+void MDCache::trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*> &expiremap)
+{
bool is_standby_replay = mds->is_standby_replay();
- int unexpirable = 0;
- list<CDentry*> unexpirables;
+ std::vector<CDentry *> unexpirables;
+ uint64_t trimmed = 0;
+
+ dout(7) << "trim_lru trimming " << count
+ << " items from LRU"
+ << " size=" << lru.lru_get_size()
+ << " mid=" << lru.lru_get_top()
+ << " pintail=" << lru.lru_get_pintail()
+ << " pinned=" << lru.lru_get_num_pinned()
+ << dendl;
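+
+  // Drain the bottom LRU completely first; dentries that cannot be trimmed
+  // yet are re-inserted below.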
for (;;) {
CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
break;
if (trim_dentry(dn, expiremap)) {
unexpirables.push_back(dn);
- ++unexpirable;
+ } else {
+ trimmed++;
}
}
- for(auto dn : unexpirables)
+ for (auto &dn : unexpirables) {
bottom_lru.lru_insert_mid(dn);
+ }
unexpirables.clear();
- // trim dentries from the LRU: only enough to satisfy `max`,
- while (lru.lru_get_size() + unexpirable > (unsigned)max) {
+  // trim dentries from the LRU until count is satisfied or the cache is no
+  // longer too full
+ while (cache_toofull() || count > 0) {
CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
if (!dn) {
break;
}
if ((is_standby_replay && dn->get_linkage()->inode &&
- dn->get_linkage()->inode->item_open_file.is_on_list()) ||
- trim_dentry(dn, expiremap)) {
+ dn->get_linkage()->inode->item_open_file.is_on_list())) {
unexpirables.push_back(dn);
- ++unexpirable;
+ } else if (trim_dentry(dn, expiremap)) {
+ unexpirables.push_back(dn);
+ } else {
+ trimmed++;
}
+    if (count > 0)
+      count--;
}
- for(auto dn : unexpirables)
+
+ for (auto &dn : unexpirables) {
lru.lru_insert_mid(dn);
+ }
unexpirables.clear();
+ dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
+}
+
+/*
+ * note: only called while MDS is active or stopping... NOT during recovery.
+ * however, we may expire a replica whose authority is recovering.
+ *
+ * @param count is number of dentries to try to expire
+ */
+bool MDCache::trim(uint64_t count)
+{
+ uint64_t used = cache_size();
+ uint64_t limit = cache_limit_memory();
+ map<mds_rank_t, MCacheExpire*> expiremap;
+
+ dout(7) << "trim bytes_used=" << bytes2str(used)
+ << " limit=" << bytes2str(limit)
+ << " reservation=" << cache_reservation()
+ << "% count=" << count << dendl;
+
+ // process delayed eval_stray()
+ stray_manager.advance_delayed();
+
+ trim_lru(count, expiremap);
+
// trim non-auth, non-bound subtrees
- for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
- p != subtrees.end();) {
+ for (auto p = subtrees.begin(); p != subtrees.end();) {
CDir *dir = p->first;
++p;
CInode *diri = dir->get_inode();
}
// trim root?
- if (max == 0 && root) {
+ if (mds->is_stopping() && root) {
list<CDir*> ls;
root->get_dirfrags(ls);
for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
}
// Other rank's base inodes (when I'm stopping)
- if (max == 0) {
+ if (mds->is_stopping()) {
for (set<CInode*>::iterator p = base_inodes.begin();
p != base_inodes.end(); ++p) {
if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
// check client caps
assert(CInode::count() == inode_map.size() + snap_inode_map.size());
- float caps_per_inode = 0.0;
+ double caps_per_inode = 0.0;
if (CInode::count())
- caps_per_inode = (float)Capability::count() / (float)CInode::count();
+ caps_per_inode = (double)Capability::count() / (double)CInode::count();
dout(2) << "check_memory_usage"
<< " total " << last.get_total()
mds->mlogger->set(l_mdm_rss, last.get_rss());
mds->mlogger->set(l_mdm_heap, last.get_heap());
- if (num_inodes_with_caps > g_conf->mds_cache_size) {
- float ratio = (float)g_conf->mds_cache_size * .9 / (float)num_inodes_with_caps;
- if (ratio < 1.0) {
- last_recall_state = ceph_clock_now();
- mds->server->recall_client_state(ratio);
- }
+ if (cache_toofull()) {
+ last_recall_state = ceph_clock_now();
+ mds->server->recall_client_state();
}
// If the cache size had exceeded its limit, but we're back in bounds
// now, free any unused pool memory so that our memory usage isn't
// permanently bloated.
- if (exceeded_size_limit
- && CInode::count() <=
- g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
+ if (exceeded_size_limit && !cache_toofull()) {
// Only do this once we are back in bounds: otherwise the releases would
// slow down whatever process caused us to exceed bounds to begin with
if (ceph_using_tcmalloc()) {
}
// trim cache
- trim(0);
+ trim(UINT64_MAX);
dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
// SUBTREES
bool exceeded_size_limit;
public:
+ static uint64_t cache_limit_inodes(void) {
+ return g_conf->get_val<int64_t>("mds_cache_size");
+ }
+ static uint64_t cache_limit_memory(void) {
+ return g_conf->get_val<uint64_t>("mds_cache_memory_limit");
+ }
+ static double cache_reservation(void) {
+ return g_conf->get_val<double>("mds_cache_reservation");
+ }
+ static double cache_mid(void) {
+ return g_conf->get_val<double>("mds_cache_mid");
+ }
+ static double cache_health_threshold(void) {
+ return g_conf->get_val<double>("mds_health_cache_threshold");
+ }
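+  /* Fraction by which the cache exceeds its reservation-adjusted limits
+   * (0.0 means within bounds). For example, with the default 1GB memory limit
+   * and 5% reservation the reserve is ~0.95GB, so a 1.2GB cache yields a
+   * ratio of roughly (1.2-0.95)/0.95 = 0.26. */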
+ double cache_toofull_ratio(void) const {
+ uint64_t inode_limit = cache_limit_inodes();
+ double inode_reserve = inode_limit*(1.0-cache_reservation());
+ double memory_reserve = cache_limit_memory()*(1.0-cache_reservation());
+    return fmax(0.0, fmax((cache_size()-memory_reserve)/memory_reserve,
+                inode_limit == 0 ? 0.0 : (CInode::count()-inode_reserve)/inode_reserve));
+ }
+ bool cache_toofull(void) const {
+ return cache_toofull_ratio() > 0.0;
+ }
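+  /* bytes currently allocated from the MDS cache mempool (mds_co) */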
+ uint64_t cache_size(void) const {
+ return mempool::get_pool(mempool::mds_co::id).allocated_bytes();
+ }
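+  /* true once the inode count or memory usage exceeds its limit by
+   * mds_health_cache_threshold (1.5x by default); used by the beacon to raise
+   * the MDS_CACHE_OVERSIZED health warning */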
+ bool cache_overfull(void) const {
+ uint64_t inode_limit = cache_limit_inodes();
+ return (inode_limit > 0 && CInode::count() > inode_limit*cache_health_threshold()) || (cache_size() > cache_limit_memory()*cache_health_threshold());
+ }
+
void advance_stray() {
stray_index = (stray_index+1)%NUM_STRAY;
}
size_t get_cache_size() { return lru.lru_get_size(); }
// trimming
- bool trim(int max=-1, int count=-1); // trim cache
+ bool trim(uint64_t count=0);
+private:
+ void trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*>& expiremap);
bool trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap);
void trim_dirfrag(CDir *dir, CDir *con,
map<mds_rank_t, MCacheExpire*>& expiremap);
map<mds_rank_t,class MCacheExpire*>& expiremap);
void send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap);
void trim_non_auth(); // trim out trimmable non-auth items
+public:
bool trim_non_auth_subtree(CDir *directory);
void standby_trim_segment(LogSegment *ls);
void try_trim_non_auth_subtree(CDir *dir);
if (removed_segment) {
dout(20) << " calling mdcache->trim!" << dendl;
- mds->mdcache->trim(-1);
+ mds->mdcache->trim();
} else {
dout(20) << " removed no segments!" << dendl;
}
cache->show_subtrees();
audit();
- cache->trim(-1, num_dentries); // try trimming exported dentries
+ cache->trim(num_dentries); // try trimming exported dentries
// send pending import_maps?
mds->mdcache->maybe_send_pending_resolves();
// log our failure
mds->mdlog->start_submit_entry(new EImportFinish(dir, false)); // log failure
- cache->trim(-1, num_dentries); // try trimming dentries
+ cache->trim(num_dentries); // try trimming dentries
// notify bystanders; wait in aborting state
import_notify_abort(dir, bounds);
* to trim some caps, and consequently unpin some inodes in the MDCache so
* that it can trim too.
*/
-void Server::recall_client_state(float ratio)
+void Server::recall_client_state(void)
{
- int max_caps_per_client = (int)(g_conf->mds_cache_size * .8);
- int min_caps_per_client = 100;
+  /* no client may retain more than 80% of all outstanding caps */
+  uint64_t max_caps_per_client = (Capability::count() * .8);
+  uint64_t min_caps_per_client = 100;
+  /* ratio: the fraction of its current caps each client is asked to keep. It
+   * is derived from how far the cache is over its reservation and is capped
+   * so that at most 80% of a client's caps are recalled. */
+  double ratio = 1.0-fmin(0.80, mdcache->cache_toofull_ratio());
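+  /* e.g. a toofull ratio of 0.26 gives ratio = 0.74: each client holding more
+   * than min_caps_per_client caps is asked to shrink to 74% of its current
+   * caps (subject to max_caps_per_client). */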
dout(10) << "recall_client_state " << ratio
<< ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
set<Session*> sessions;
mds->sessionmap.get_client_session_set(sessions);
- for (set<Session*>::const_iterator p = sessions.begin();
- p != sessions.end();
- ++p) {
- Session *session = *p;
+ for (auto &session : sessions) {
if (!session->is_open() ||
!session->info.inst.name.is_client())
continue;
<< dendl;
if (session->caps.size() > min_caps_per_client) {
- int newlim = MIN((int)(session->caps.size() * ratio), max_caps_per_client);
+ uint64_t newlim = MIN((session->caps.size() * ratio), max_caps_per_client);
if (session->caps.size() > newlim) {
MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
m->head.max_caps = newlim;
void reconnect_tick();
void recover_filelocks(CInode *in, bufferlist locks, int64_t client);
- void recall_client_state(float ratio);
+ void recall_client_state(void);
void force_clients_readonly();
// -- requests --