via the `ceph tell mds.<foo> cache drop` command or large reductions in the
cache size will no longer cause service unavailability.
+* The CephFS MDS behavior with recalling caps has been significantly improved
+  so that it no longer attempts to recall too many caps at once, which
+  previously led to instability. MDS daemons with a large cache (64GB+) should
+  be more stable.
+
>=13.1.0
--------
tasks:
- exec:
mon.a:
- - "ceph tell mds.* config set mds_max_ratio_caps_per_client 1"
- "ceph tell mds.* config set mds_min_caps_per_client 1"
- background_exec:
mon.a:
mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
self.assertTrue(open_files >= mds_min_caps_per_client)
- mds_max_ratio_caps_per_client = float(self.fs.get_config("mds_max_ratio_caps_per_client"))
mount_a_client_id = self.mount_a.get_global_id()
path = "subdir/mount_a" if use_subdir else "mount_a"
# The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
# which depend on the caps outstanding, cache size and overall ratio
- recall_expected_value = int((1.0-mds_max_ratio_caps_per_client)*(open_files+2))
def expected_caps():
num_caps = self.get_session(mount_a_client_id)['num_caps']
if num_caps < mds_min_caps_per_client:
raise RuntimeError("client caps fell below min!")
elif num_caps == mds_min_caps_per_client:
return True
- elif recall_expected_value*.95 <= num_caps <= recall_expected_value*1.05:
+ elif num_caps < cache_size:
return True
else:
return False
.set_default(1024)
.set_description("number of omap keys to read from the SessionMap in one operation"),
+ Option("mds_recall_max_caps", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+ .set_default(5000)
+ .set_description("maximum number of caps to recall from client session in single recall"),
+
+ Option("mds_recall_max_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(2.5)
+ .set_description("decay rate for throttle on recalled caps on a session"),
+
+ Option("mds_recall_max_decay_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+ .set_default(16_K)
+ .set_description("decay threshold for throttle on recalled caps on a session"),
+
+ Option("mds_recall_global_max_decay_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+ .set_default(64_K)
+ .set_description("decay threshold for throttle on recalled caps globally"),
+
Option("mds_recall_state_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(60)
.set_description("timeout for clients late on cap recall to create health warnings"),
.set_default(100)
.set_description("minimum number of capabilities a client may hold"),
- Option("mds_max_ratio_caps_per_client", Option::TYPE_FLOAT, Option::LEVEL_DEV)
- .set_default(.8)
- .set_description("maximum ratio of current caps that may be recalled during MDS cache pressure"),
-
Option("mds_hack_allow_loading_invalid_metadata", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(0)
.set_description("INTENTIONALLY CAUSE DATA LOSS by bypasing checks for invalid metadata on disk. Allows testing repair tools."),
{
set<Session*> sessions;
mds->sessionmap.get_client_session_set(sessions);
+ auto now = clock::now();
- auto mds_recall_state_timeout = g_conf()->mds_recall_state_timeout;
- auto last_recall = mds->mdcache->last_recall_state;
- auto last_recall_span = std::chrono::duration<double>(clock::now()-last_recall).count();
- bool recall_state_timedout = last_recall_span > mds_recall_state_timeout;
+ const auto mds_recall_state_timeout = g_conf()->mds_recall_state_timeout;
+ const auto last_recall = mds->server->last_recalled();
+ const auto last_recall_span = std::chrono::duration<double>(now-last_recall).count();
+ const bool recall_state_timedout = last_recall_span > mds_recall_state_timeout;
std::list<MDSHealthMetric> late_recall_metrics;
std::list<MDSHealthMetric> large_completed_requests_metrics;
for (auto& session : sessions) {
- if (session->recalled_at != Session::clock::zero()) {
- auto last_recall_sent = session->last_recall_sent;
- auto recalled_at = session->recalled_at;
- auto recalled_at_span = std::chrono::duration<double>(clock::now()-recalled_at).count();
-
- dout(20) << "Session servicing RECALL " << session->info.inst
- << ": " << recalled_at_span << "s ago " << session->recall_release_count
- << "/" << session->recall_count << dendl;
- if (recall_state_timedout || last_recall_sent < last_recall) {
+ const auto& recall_release_count = session->recall_release_count;
+ const auto& recall_count = session->recall_count;
+ if (recall_release_count < recall_count) {
+ const auto& recalled_at = session->recalled_at;
+ const auto& released_at = session->released_at;
+ const auto recalled_at_span = std::chrono::duration<double>(now-recalled_at).count();
+ const auto released_at_span = std::chrono::duration<double>(now-released_at).count();
+
+ dout(20) << "Session " << session->info.inst
+ << " last released " << recall_release_count << "/" << recall_count << " caps "
+ << released_at_span << "s ago " << dendl;
+ if (recall_state_timedout) {
dout(20) << " no longer recall" << dendl;
- session->clear_recalled_at();
- } else if (recalled_at_span > mds_recall_state_timeout) {
- dout(20) << " exceeded timeout " << recalled_at_span << " vs. " << mds_recall_state_timeout << dendl;
+ session->clear_recalled();
+ } else if (released_at_span > mds_recall_state_timeout && recalled_at_span > mds_recall_state_timeout) {
+ dout(20) << " exceeded timeout " << released_at_span << " vs. " << mds_recall_state_timeout << dendl;
std::ostringstream oss;
oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
m.metadata["client_id"] = stringify(session->get_client());
late_recall_metrics.push_back(m);
} else {
- dout(20) << " within timeout " << recalled_at_span << " vs. " << mds_recall_state_timeout << dendl;
+ dout(20) << " within timeout " << released_at_span << " vs. " << mds_recall_state_timeout << dendl;
}
}
if ((session->get_num_trim_requests_warnings() > 0 &&
mds->mlogger->set(l_mdm_heap, last.get_heap());
if (cache_toofull()) {
- last_recall_state = clock::now();
- mds->server->recall_client_state(-1.0, false, nullptr);
+ mds->server->recall_client_state(nullptr);
}
// If the cache size had exceeded its limit, but we're back in bounds
void trim_client_leases();
void check_memory_usage();
- time last_recall_state;
-
// shutdown
private:
set<inodeno_t> shutdown_exporting_strays;
"mds_inject_migrator_session_race",
"host",
"fsid",
- "mds_request_load_average_decay_rate",
"mds_cap_revoke_eviction_timeout",
+ // SessionMap
+ "mds_request_load_average_decay_rate",
+ "mds_recall_max_decay_rate",
NULL
};
return KEYS;
Formatter *f, Context *on_finish)
: MDSInternalContext(mds),
server(server), mdcache(mdcache), mdlog(mdlog),
- recall_timeout(recall_timeout), f(f), on_finish(on_finish),
+ recall_timeout(recall_timeout), recall_start(mono_clock::now()),
+ f(f), on_finish(on_finish),
whoami(mds->whoami), incarnation(mds->incarnation) {
}
f->open_object_section("result");
MDSGatherBuilder *gather = new MDSGatherBuilder(g_ceph_context);
- server->recall_client_state(1.0, true, gather);
+ auto [throttled, count] = server->recall_client_state(gather, Server::RecallFlags::STEADY);
+ dout(10) << __func__
+ << (throttled ? " (throttled)" : "")
+ << " recalled " << count << " caps" << dendl;
+
if (!gather->has_subs()) {
- handle_recall_client_state(0);
delete gather;
- return;
+ return handle_recall_client_state(0);
}
C_ContextTimeout *ctx = new C_ContextTimeout(
MDCache *mdcache;
MDLog *mdlog;
uint64_t recall_timeout;
+ mono_time recall_start;
Formatter *f;
Context *on_finish;
#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>
+#include <boost/range/adaptor/reversed.hpp>
#include "MDSRank.h"
#include "Server.h"
#include "osd/OSDMap.h"
#include <errno.h>
+#include <math.h>
#include <list>
#include <iostream>
reconnect_done(NULL),
failed_reconnects(0),
reconnect_evicting(false),
- terminating_sessions(false)
+ terminating_sessions(false),
+ recall_counter(g_conf().get_val<double>("mds_recall_max_decay_rate"))
{
supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
}
dout(20) << __func__ << " cap revoke eviction timeout changed to "
<< cap_revoke_eviction_timeout << dendl;
}
+ if (changed.count("mds_recall_max_decay_rate")) {
+ recall_counter = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
+ }
}
/*
}
}
-
/**
* Call this when the MDCache is oversized, to send requests to the clients
* to trim some caps, and consequently unpin some inodes in the MDCache so
* that it can trim too.
*/
-void Server::recall_client_state(double ratio, bool flush_client_session,
- MDSGatherBuilder *gather) {
- if (flush_client_session) {
- assert(gather != nullptr);
- }
-
- /* try to recall at least 80% of all caps */
- uint64_t max_caps_per_client = Capability::count() * g_conf().get_val<double>("mds_max_ratio_caps_per_client");
- uint64_t min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
- if (max_caps_per_client < min_caps_per_client) {
- dout(0) << "max_caps_per_client " << max_caps_per_client
- << " < min_caps_per_client " << min_caps_per_client << dendl;
- max_caps_per_client = min_caps_per_client + 1;
- }
-
- /* unless this ratio is smaller: */
- /* ratio: determine the amount of caps to recall from each client. Use
- * percentage full over the cache reservation. Cap the ratio at 80% of client
- * caps. */
- if (ratio < 0.0)
- ratio = 1.0 - fmin(0.80, mdcache->cache_toofull_ratio());
-
- dout(10) << __func__ << ": ratio=" << ratio << ", caps per client "
- << min_caps_per_client << "-" << max_caps_per_client << dendl;
-
- set<Session*> sessions;
- mds->sessionmap.get_client_session_set(sessions);
-
- for (auto &session : sessions) {
+std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
+{
+ const auto now = clock::now();
+ const bool steady = flags&RecallFlags::STEADY;
+
+ const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
+ const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
+ const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
+ const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
+
+ dout(10) << __func__ << ": caps per client " << min_caps_per_client << "/" << Capability::count() << dendl;
+
+ /* trim caps of sessions with the most caps first */
+ std::multimap<uint64_t, Session*> caps_session;
+ auto f = [&caps_session](auto& s) {
+ caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(s->caps.size()), std::forward_as_tuple(s));
+ };
+ mds->sessionmap.get_client_sessions(std::move(f));
+
+ std::pair<bool, uint64_t> result = {false, 0};
+ auto& [throttled, caps_recalled] = result;
+ last_recall_state = now;
+ for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
if (!session->is_open() ||
!session->get_connection() ||
!session->info.inst.name.is_client())
continue;
dout(10) << " session " << session->info.inst
- << " caps " << session->caps.size()
+ << " caps " << num_caps
<< ", leases " << session->leases.size()
<< dendl;
- uint64_t newlim = std::max(std::min<uint64_t>((session->caps.size() * ratio), max_caps_per_client), min_caps_per_client);
- if (session->caps.size() > newlim) {
+ uint64_t newlim;
+ if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
+ newlim = min_caps_per_client;
+ } else {
+ newlim = num_caps-recall_max_caps;
+ }
+ if (num_caps > newlim) {
+ /* now limit the number of caps we recall at a time to prevent overloading ourselves */
+ uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
+ newlim = num_caps-recall;
+ const uint64_t session_recall_counter = session->cap_recalled_counter();
+ const uint64_t global_recall_counter = recall_counter.get();
+ if (session_recall_counter+recall > recall_max_decay_threshold) {
+ dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_counter << "; skipping!" << dendl;
+ throttled = true;
+ continue;
+ } else if (global_recall_counter+recall > recall_global_max_decay_threshold) {
+ dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_counter << "; skipping!" << dendl;
+ throttled = true;
+ break;
+ }
+
+ // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
+ const auto& recalled_at = session->recalled_at;
+ auto last_recalled = std::chrono::duration<double>(now-recalled_at).count();
+ const auto& released_at = session->released_at;
+ const auto& last_recall_count = session->recall_count;
+ const auto& last_recall_release_count = session->recall_release_count;
+ const auto& last_recall_limit = session->recall_limit;
+ bool limit_similarity = (abs((double)num_caps-last_recall_limit+recall_max_caps)/(num_caps+recall_max_caps)) < 0.05;
+ if (last_recalled < 3600.0 && released_at < recalled_at && last_recall_count > 2*last_recall_release_count && limit_similarity && steady) {
+ /* The session has recently (1hr) been asked to release caps and we
+ * were unable to get at least half of the recalled caps.
+ */
+ dout(15) << " last recalled " << last_recall_count << "/" << (last_recall_count+last_recall_limit)
+ << " caps " << last_recalled << "s ago; released "
+ << last_recall_release_count << " caps. Skipping because we are unlikely to get more released." << dendl;
+ continue;
+ }
+
auto m = MClientSession::create(CEPH_SESSION_RECALL_STATE);
m->head.max_caps = newlim;
mds->send_message_client(m, session);
- if (flush_client_session) {
+ if (gather) {
flush_session(session, gather);
}
session->notify_recall_sent(newlim);
+ recall_counter.hit(recall);
+ caps_recalled += recall;
}
}
+
+ dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
+
+ return result;
}
void Server::force_clients_readonly()
#include <string_view>
+#include <common/DecayCounter.h>
+
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
bool waiting_for_reconnect(client_t c) const;
void dump_reconnect_status(Formatter *f) const;
+ time last_recalled() const {
+ return last_recall_state;
+ }
+
void handle_client_session(const MClientSession::const_ref &m);
void _session_logged(Session *session, uint64_t state_seq,
bool open, version_t pv, interval_set<inodeno_t>& inos,version_t piv);
void reconnect_tick();
void recover_filelocks(CInode *in, bufferlist locks, int64_t client);
- void recall_client_state(double ratio, bool flush_client_session,
- MDSGatherBuilder *gather);
+ enum RecallFlags {
+ NONE = 0,
+ STEADY = (1<<0),
+ };
+ std::pair<bool, uint64_t> recall_client_state(MDSGatherBuilder* gather, enum RecallFlags=RecallFlags::NONE);
void force_clients_readonly();
// -- requests --
private:
void reply_client_request(MDRequestRef& mdr, const MClientReply::ref &reply);
void flush_session(Session *session, MDSGatherBuilder *gather);
+
+ DecayCounter recall_counter;
+ time last_recall_state;
};
#endif
*/
void Session::notify_cap_release(size_t n_caps)
{
- if (recalled_at != clock::zero()) {
- recall_release_count += n_caps;
- if (recall_release_count >= recall_count)
- clear_recalled_at();
+ recall_release_count += n_caps;
+ if (n_caps > 0) {
+ released_at = clock::now();
+ if (recall_count <= recall_release_count) {
+ clear_recalled();
+ }
}
}
*/
void Session::notify_recall_sent(const size_t new_limit)
{
- if (recalled_at == clock::zero()) {
- // Entering recall phase, set up counters so we can later
- // judge whether the client has respected the recall request
- recalled_at = last_recall_sent = clock::now();
- assert (new_limit < caps.size()); // Behaviour of Server::recall_client_state
- recall_count = caps.size() - new_limit;
+ const auto num_caps = caps.size();
+ const auto count = num_caps-new_limit;
+
+ /* Entering recall phase, set up counters so we can later judge whether the
+ * client has respected the recall request. Update only if client has not
+ * released caps from a previous recall.
+ */
+
+ if (recall_limit != new_limit) {
+ const auto now = clock::now();
+ recalled_at = now;
+ assert (new_limit < num_caps); // Behaviour of Server::recall_client_state
+ recall_count = count;
recall_release_count = 0;
- } else {
- last_recall_sent = clock::now();
+ recall_limit = new_limit;
}
+
+ /* Always hit the session counter as a RECALL message is still sent to the
+ * client and we do not want the MDS to burn its global counter tokens on a
+ * session that is not releasing caps (i.e. allow the session counter to
+ * throttle future RECALL messages).
+ */
+ cap_recalled.hit(count);
}
-void Session::clear_recalled_at()
+void Session::clear_recalled()
{
- recalled_at = last_recall_sent = clock::zero();
recall_count = 0;
recall_release_count = 0;
+ recall_limit = 0;
}
/**
}
void SessionMap::handle_conf_change(const ConfigProxy &conf,
- const std::set <std::string> &changed) {
- if (changed.count("mds_request_load_average_decay_rate")) {
- decay_rate = g_conf().get_val<double>("mds_request_load_average_decay_rate");
- dout(20) << __func__ << " decay rate changed to " << decay_rate << dendl;
-
- total_load_avg = DecayCounter(decay_rate);
-
- auto p = by_state.find(Session::STATE_OPEN);
- if (p != by_state.end()) {
- for (const auto &session : *(p->second)) {
- session->set_load_avg_decay_rate(decay_rate);
+ const std::set <std::string> &changed)
+{
+ auto apply_to_open_sessions = [this](auto f) {
+ if (auto it = by_state.find(Session::STATE_OPEN); it != by_state.end()) {
+ for (const auto &session : *(it->second)) {
+ f(session);
}
}
- p = by_state.find(Session::STATE_STALE);
- if (p != by_state.end()) {
- for (const auto &session : *(p->second)) {
- session->set_load_avg_decay_rate(decay_rate);
+ if (auto it = by_state.find(Session::STATE_STALE); it != by_state.end()) {
+ for (const auto &session : *(it->second)) {
+ f(session);
}
}
+ };
+
+ if (changed.count("mds_request_load_average_decay_rate")) {
+ auto d = g_conf().get_val<double>("mds_request_load_average_decay_rate");
+ dout(20) << __func__ << " decay rate changed to " << d << dendl;
+
+ decay_rate = d;
+ total_load_avg = DecayCounter(d);
+
+ auto mut = [d](auto s) {
+ s->set_load_avg_decay_rate(d);
+ };
+ apply_to_open_sessions(mut);
+ }
+ if (changed.count("mds_recall_max_decay_rate")) {
+ auto d = g_conf().get_val<double>("mds_recall_max_decay_rate");
+ auto mut = [d](auto s) {
+ s->cap_recalled = DecayCounter(d);
+ };
+ apply_to_open_sessions(mut);
}
}
#include "mdstypes.h"
#include "mds/MDSAuthCaps.h"
#include "common/perf_counters.h"
+#include "common/DecayCounter.h"
class CInode;
struct MDRequestImpl;
// request load average for this session
DecayCounter load_avg;
+ // caps being recalled recently by this session
+ DecayCounter cap_recalled;
+
// session start time -- used to track average session time
// note that this is initialized in the constructor rather
// than at the time of adding a session to the sessionmap
const std::string& get_human_name() const {return human_name;}
// Ephemeral state for tracking progress of capability recalls
- time recalled_at = clock::zero(); // When was I asked to SESSION_RECALL?
- time last_recall_sent = clock::zero();
+ time recalled_at = clock::zero(); // When was I first asked to SESSION_RECALL?
+ time released_at = clock::zero(); // When did the session last release caps?
uint32_t recall_count = 0; // How many caps was I asked to SESSION_RECALL?
uint32_t recall_release_count = 0; // How many caps have I actually revoked?
+ uint32_t recall_limit = 0; // New limit in SESSION_RECALL
session_info_t info; ///< durable bits
void notify_cap_release(size_t n_caps);
void notify_recall_sent(const size_t new_limit);
- void clear_recalled_at();
+ auto cap_recalled_counter() const {
+ return cap_recalled.get();
+ }
+ void clear_recalled();
inodeno_t next_ino() const {
if (info.prealloc_inos.empty())
Session() = delete;
Session(ConnectionRef con) :
+ cap_recalled(g_conf().get_val<double>("mds_recall_max_decay_rate")),
birth_time(clock::now()),
auth_caps(g_ceph_context),
item_session_list(this),
void update_average_session_age();
SessionMap() = delete;
- explicit SessionMap(MDSRank *m)
- :
- mds(m)
- {}
+ explicit SessionMap(MDSRank *m) : mds(m) {}
~SessionMap() override
{
void dump();
- void get_client_session_set(set<Session*>& s) const {
- for (ceph::unordered_map<entity_name_t,Session*>::const_iterator p = session_map.begin();
- p != session_map.end();
- ++p)
- if (p->second->info.inst.name.is_client())
- s.insert(p->second);
+ template<typename F>
+ void get_client_sessions(F&& f) const {
+ for (const auto& p : session_map) {
+ auto& session = p.second;
+ if (session->info.inst.name.is_client())
+ f(session);
+ }
+ }
+ template<typename C>
+ void get_client_session_set(C& c) const {
+ auto f = [&c](auto& s) {
+ c.insert(s);
+ };
+ get_client_sessions(f);
}
void replay_open_sessions(map<client_t,entity_inst_t>& client_map,