Dumping huge caches (~ 1G) to formatter causes mds to
hang or get terminated. Until the underlying issue is
fixed, disallow dumping cache if cache usage exceeds this
threshold. Also, Patrick feels a similar issue might be
hiding there for cache dumps to file when cache sizes
are really huge. This case has not been reproduced
yet, hence, the limit for cache dumps to file is currently
left uncapped.
Fixes: http://tracker.ceph.com/issues/37608
Signed-off-by: Venky Shankar <vshankar@redhat.com>
(cherry picked from commit
6be2ce98c629d2cfe9d8443659e59600148b7675)
Conflicts:
src/common/options.cc
src/mds/MDCache.cc
Fixed the config option type (`TYPE_UINT` instead of `TYPE_SIZE`)
as commit
7c25de3 is not in luminous (yet).
Option("mds_cap_revoke_eviction_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description("number of seconds after which clients which have not responded to cap revoke messages by the MDS are evicted."),
+
+ Option("mds_dump_cache_threshold_formatter", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(1_G)
+ .set_description("threshold for cache usage to disallow \"dump cache\" operation to formatter")
+ .set_long_description("Disallow MDS from dumping caches to formatter via \"dump cache\" command if cache usage exceeds this threshold."),
+
+ Option("mds_dump_cache_threshold_file", Option::TYPE_UINT, Option::LEVEL_DEV)
+ .set_default(0)
+ .set_description("threshold for cache usage to disallow \"dump cache\" operation to file")
+ .set_long_description("Disallow MDS from dumping caches to file via \"dump cache\" command if cache usage exceeds this threshold."),
});
}
boost::string_view dump_root, int depth)
{
int r = 0;
+
+ // dumping large caches may cause mds to hang or worse get killed.
+ // so, disallow the dump if the cache size exceeds the configured
+ // threshold, which is 1G for formatter and unlimited for file (note
+ // that this can be jacked up by the admin... and is nothing but foot
+ // shooting, but the option itself is for devs and hence dangerous to
+ // tune). TODO: remove this when fixed.
+ uint64_t threshold = f ?
+ g_conf->get_val<uint64_t>("mds_dump_cache_threshold_formatter") :
+ g_conf->get_val<uint64_t>("mds_dump_cache_threshold_file");
+
+ if (threshold && cache_size() > threshold) {
+ if (f) {
+ std::stringstream ss;
+ ss << "cache usage exceeds dump threshold";
+ f->open_object_section("result");
+ f->dump_string("error", ss.str());
+ f->close_section();
+ } else {
+ derr << "cache usage exceeds dump threshold" << dendl;
+ r = -EINVAL;
+ }
+ return r;
+ }
+
+ r = 0;
int fd = -1;
if (f) {
"mds_cache_reservation",
"mds_health_cache_threshold",
"mds_cache_mid",
+ "mds_dump_cache_threshold_formatter",
+ "mds_dump_cache_threshold_file",
// MDBalancer
"mds_bal_fragment_interval",
// PurgeQueue