MDS when all of these caps need to be processed during certain session
events. It is recommended to not unnecessarily increase this value.
+* The MDS config mds_recall_state_timeout has been removed. Late client recall
+ warnings are now generated based on the number of caps the MDS has recalled
+ which have not been released. The new configs mds_recall_warning_threshold
+ (default: 32K) and mds_recall_warning_decay_rate (default: 60s) sets the
+ threshold for this warning.
+
* The `cache drop` admin socket command has been removed. The `ceph tell mds.X
cache drop` remains.
cache_size = open_files/2
self.set_conf('mds', 'mds cache size', cache_size)
+ self.set_conf('mds', 'mds_recall_max_caps', open_files/2)
+ self.set_conf('mds', 'mds_recall_warning_threshold', open_files)
self.fs.mds_fail_restart()
self.fs.wait_for_daemons()
mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
+ mds_recall_warning_decay_rate = self.fs.get_config("mds_recall_warning_decay_rate")
self.assertTrue(open_files >= mds_min_caps_per_client)
mount_a_client_id = self.mount_a.get_global_id()
# MDS should not be happy about that, as the client is failing to comply
# with the SESSION_RECALL messages it is being sent
- mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout"))
- self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout+10)
+ self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_warning_decay_rate*2)
# We can also test that the MDS health warning for oversized
# cache is functioning as intended.
- self.wait_for_health("MDS_CACHE_OVERSIZED",
- mds_recall_state_timeout + 10)
+ self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_warning_decay_rate*2)
# When the client closes the files, it should retain only as many caps as allowed
# under the SESSION_RECALL policy
OPTION(mds_session_blacklist_on_evict, OPT_BOOL) // whether to blacklist clients whose sessions are dropped via admin commands
OPTION(mds_sessionmap_keys_per_op, OPT_U32) // how many sessions should I try to load/store in a single OMAP operation?
-OPTION(mds_recall_state_timeout, OPT_FLOAT) // detect clients which aren't trimming caps
OPTION(mds_freeze_tree_timeout, OPT_FLOAT) // detecting freeze tree deadlock
OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many'
OPTION(mds_reconnect_timeout, OPT_FLOAT) // seconds to wait for clients during mds restart
.set_default(64_K)
.set_description("decay threshold for throttle on recalled caps globally"),
- Option("mds_recall_state_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
- .set_default(60)
- .set_description("timeout for clients late on cap recall to create health warnings"),
+ Option("mds_recall_warning_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+ .set_default(32_K)
+ .set_description("decay threshold for warning on slow session cap recall"),
+
+ Option("mds_recall_warning_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+ .set_default(60.0)
+ .set_description("decay rate for warning on slow session cap recall"),
Option("mds_freeze_tree_timeout", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(30)
{
set<Session*> sessions;
mds->sessionmap.get_client_session_set(sessions);
- auto now = clock::now();
-
- const auto mds_recall_state_timeout = g_conf->mds_recall_state_timeout;
- const auto last_recall = mds->server->last_recalled();
- const auto last_recall_span = std::chrono::duration<double>(clock::now()-last_recall).count();
- const bool recall_state_timedout = last_recall_span > mds_recall_state_timeout;
+ const auto recall_warning_threshold = g_conf->get_val<Option::size_t>("mds_recall_warning_threshold");
+ const auto max_completed_requests = g_conf->mds_max_completed_requests;
+ const auto max_completed_flushes = g_conf->mds_max_completed_flushes;
std::list<MDSHealthMetric> late_recall_metrics;
std::list<MDSHealthMetric> large_completed_requests_metrics;
for (auto& session : sessions) {
- const auto& recall_release_count = session->recall_release_count;
- const auto& recall_count = session->recall_count;
- if (recall_release_count < recall_count) {
- const auto& recalled_at = session->recalled_at;
- const auto& released_at = session->released_at;
- const auto recalled_at_span = std::chrono::duration<double>(now-recalled_at).count();
- const auto released_at_span = std::chrono::duration<double>(now-released_at).count();
-
- dout(20) << "Session " << session->info.inst
- << " last released " << recall_release_count << "/" << recall_count << " caps "
- << released_at_span << "s ago " << dendl;
- if (recall_state_timedout) {
- dout(20) << " no longer recall" << dendl;
- session->clear_recalled();
- } else if (released_at_span > mds_recall_state_timeout && recalled_at_span > mds_recall_state_timeout) {
- dout(20) << " exceeded timeout " << released_at_span << " vs. " << mds_recall_state_timeout << dendl;
- std::ostringstream oss;
- oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
- MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
- m.metadata["client_id"] = stringify(session->get_client());
- late_recall_metrics.push_back(m);
- } else {
- dout(20) << " within timeout " << released_at_span << " vs. " << mds_recall_state_timeout << dendl;
- }
+ const uint64_t recall_caps = session->get_recall_caps();
+ if (recall_caps > recall_warning_threshold) {
+ dout(2) << "Session " << *session <<
+ " is not releasing caps fast enough. Recalled caps at " << recall_caps
+ << " > " << recall_warning_threshold << " (mds_recall_warning_threshold)." << dendl;
+ std::ostringstream oss;
+ oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
+ MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
+ m.metadata["client_id"] = stringify(session->get_client());
+ late_recall_metrics.push_back(m);
}
if ((session->get_num_trim_requests_warnings() > 0 &&
- session->get_num_completed_requests() >= g_conf->mds_max_completed_requests) ||
+ session->get_num_completed_requests() >= max_completed_requests) ||
(session->get_num_trim_flushes_warnings() > 0 &&
- session->get_num_completed_flushes() >= g_conf->mds_max_completed_flushes)) {
+ session->get_num_completed_flushes() >= max_completed_flushes)) {
std::ostringstream oss;
oss << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid. ";
MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str());
failed_reconnects(0),
reconnect_evicting(false),
terminating_sessions(false),
- recall_counter(g_conf->get_val<double>("mds_recall_max_decay_rate"))
+ recall_throttle(ceph_clock_now(), g_conf->get_val<double>("mds_recall_max_decay_rate"))
{
cap_revoke_eviction_timeout = g_conf->get_val<double>("mds_cap_revoke_eviction_timeout");
supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
<< cap_revoke_eviction_timeout << dendl;
}
if (changed.count("mds_recall_max_decay_rate")) {
- recall_counter = DecayCounter(ceph_clock_now(), g_conf->get_val<double>("mds_recall_max_decay_rate"));
+ recall_throttle = DecayCounter(ceph_clock_now(), g_conf->get_val<double>("mds_recall_max_decay_rate"));
}
}
const auto recall_max_caps = g_conf->get_val<Option::size_t>("mds_recall_max_caps");
const auto recall_max_decay_threshold = g_conf->get_val<Option::size_t>("mds_recall_max_decay_threshold");
- dout(10) << __func__ << ":"
+ dout(7) << __func__ << ":"
<< " min=" << min_caps_per_client
<< " max=" << max_caps_per_client
<< " total=" << Capability::count()
!session->info.inst.name.is_client())
continue;
- dout(10) << " session " << session->info.inst
+ dout(10) << __func__ << ":"
+ << " session " << session->info.inst
<< " caps " << num_caps
<< ", leases " << session->leases.size()
<< dendl;
/* now limit the number of caps we recall at a time to prevent overloading ourselves */
uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
newlim = num_caps-recall;
- const uint64_t session_recall_counter = session->cap_recalled_counter();
- const uint64_t global_recall_counter = recall_counter.get();
- if (session_recall_counter+recall > recall_max_decay_threshold) {
- dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_counter << "; skipping!" << dendl;
+ const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
+ const uint64_t global_recall_throttle = recall_throttle.get(ceph_clock_now());
+ if (session_recall_throttle+recall > recall_max_decay_threshold) {
+ dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
throttled = true;
continue;
- } else if (global_recall_counter+recall > recall_global_max_decay_threshold) {
- dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_counter << "; skipping!" << dendl;
+ } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
+ dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
throttled = true;
break;
}
// now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
- const auto& recalled_at = session->recalled_at;
- auto last_recalled = std::chrono::duration<double>(now-recalled_at).count();
- const auto& released_at = session->released_at;
- const auto& last_recall_count = session->recall_count;
- const auto& last_recall_release_count = session->recall_release_count;
- const auto& last_recall_limit = session->recall_limit;
- bool limit_similarity = (abs((double)num_caps-last_recall_limit+recall_max_caps)/(num_caps+recall_max_caps)) < 0.05;
- if (last_recalled < 3600.0 && released_at < recalled_at && last_recall_count > 2*last_recall_release_count && limit_similarity && steady) {
- /* The session has recently (1hr) been asked to release caps and we
- * were unable to get at least half of the recalled caps.
- */
- dout(15) << " last recalled " << last_recall_count << "/" << (last_recall_count+last_recall_limit)
- << " caps " << last_recalled << "s ago; released "
- << last_recall_release_count << " caps. Skipping because we are unlikely to get more released." << dendl;
- continue;
+ if (steady) {
+ const auto session_recall = session->get_recall_caps();
+ const auto session_release = session->get_release_caps();
+ if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
+ /* The session has been unable to keep up with the number of caps
+ * recalled (by half); additionally, to prevent marking sessions
+ * we've just begun to recall from, the session_recall counter
+ * (decayed count of caps recently recalled) is **greater** than the
+ * session threshold for the session's cap recall throttle.
+ */
+ dout(15) << " 2*session_release < session_recall"
+ " (2*" << session_release << " < " << session_recall << ");"
+ " Skipping because we are unlikely to get more released." << dendl;
+ continue;
+ } else if (recall < recall_max_caps && 2*recall < session_recall) {
+ /* The number of caps recalled is less than the number we *could*
+ * recall (so there isn't much left to recall?) and the number of
+ * caps is less than the current recall_caps counter (decayed count
+ * of caps recently recalled).
+ */
+ dout(15) << " 2*recall < session_recall "
+ " (2*" << recall << " < " << session_recall << ") &&"
+ " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
+ " Skipping because we are unlikely to get more released." << dendl;
+ continue;
+ }
}
+ dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
+
auto m = new MClientSession(CEPH_SESSION_RECALL_STATE);
m->head.max_caps = newlim;
mds->send_message_client(m, session);
flush_session(session, gather);
}
caps_recalled += session->notify_recall_sent(newlim);
- recall_counter.hit(ceph_clock_now(), recall);
+ recall_throttle.hit(ceph_clock_now(), recall);
}
}
void reply_client_request(MDRequestRef& mdr, MClientReply *reply);
void flush_session(Session *session, MDSGatherBuilder *gather);
- DecayCounter recall_counter;
+ DecayCounter recall_throttle;
time last_recall_state;
};
*/
void Session::notify_cap_release(size_t n_caps)
{
- recall_release_count += n_caps;
- if (n_caps > 0) {
- released_at = clock::now();
- if (recall_count <= recall_release_count) {
- clear_recalled();
- }
- }
+ recall_caps.hit(ceph_clock_now(), -(double)n_caps);
+ release_caps.hit(ceph_clock_now(), n_caps);
}
/**
* in order to generate health metrics if the session doesn't see
* a commensurate number of calls to ::notify_cap_release
*/
-uint64_t Session::notify_recall_sent(const size_t new_limit)
+uint64_t Session::notify_recall_sent(size_t new_limit)
{
const auto num_caps = caps.size();
- ceph_assert(new_limit < num_caps);
+ ceph_assert(new_limit < num_caps); // Behaviour of Server::recall_client_state
const auto count = num_caps-new_limit;
-
- /* Entering recall phase, set up counters so we can later judge whether the
- * client has respected the recall request. Update only if client has not
- * released caps from a previous recall.
- */
-
uint64_t new_change;
if (recall_limit != new_limit) {
- const auto now = clock::now();
- recalled_at = now;
- assert (new_limit < num_caps); // Behaviour of Server::recall_client_state
- recall_count = count;
- recall_release_count = 0;
- recall_limit = new_limit;
new_change = count;
} else {
new_change = 0; /* no change! */
* session that is not releasing caps (i.e. allow the session counter to
* throttle future RECALL messages).
*/
- cap_recalled.hit(count);
+ recall_caps_throttle.hit(ceph_clock_now(), count);
+ recall_caps.hit(ceph_clock_now(), count);
return new_change;
}
-void Session::clear_recalled()
-{
- recall_count = 0;
- recall_release_count = 0;
- recall_limit = 0;
-}
-
void Session::set_client_metadata(const client_metadata_t& meta)
{
info.client_metadata = meta;
auto d = g_conf->get_val<double>("mds_recall_max_decay_rate");
if (auto it = by_state.find(Session::STATE_OPEN); it != by_state.end()) {
for (const auto &session : *(it->second)) {
- session->cap_recalled = DecayCounter(ceph_clock_now(), d);
+ session->recall_caps_throttle = DecayCounter(ceph_clock_now(), d);
+ }
+ }
+ if (auto it = by_state.find(Session::STATE_STALE); it != by_state.end()) {
+ for (const auto &session : *(it->second)) {
+ session->recall_caps_throttle = DecayCounter(ceph_clock_now(), d);
+ }
+ }
+ }
+ if (changed.count("mds_recall_warning_decay_rate")) {
+ auto d = g_conf->get_val<double>("mds_recall_warning_decay_rate");
+ if (auto it = by_state.find(Session::STATE_OPEN); it != by_state.end()) {
+ for (const auto &session : *(it->second)) {
+ session->recall_caps = DecayCounter(ceph_clock_now(), d);
+ session->release_caps = DecayCounter(ceph_clock_now(), d);
}
}
if (auto it = by_state.find(Session::STATE_STALE); it != by_state.end()) {
for (const auto &session : *(it->second)) {
- session->cap_recalled = DecayCounter(ceph_clock_now(), d);
+ session->recall_caps = DecayCounter(ceph_clock_now(), d);
+ session->release_caps = DecayCounter(ceph_clock_now(), d);
}
}
}
mutable DecayCounter load_avg;
DecayRate load_avg_rate;
- // caps being recalled recently by this session
- DecayCounter cap_recalled;
+ // Ephemeral state for tracking progress of capability recalls
+ // caps being recalled recently by this session; used for Beacon warnings
+ mutable DecayCounter recall_caps;
+ // caps that have been released
+ mutable DecayCounter release_caps;
+ // throttle on caps recalled
+ mutable DecayCounter recall_caps_throttle;
+ // New limit in SESSION_RECALL
+ uint32_t recall_limit = 0;
// session start time -- used to track average session time
// note that this is initialized in the constructor rather
void set_client_metadata(const client_metadata_t& meta);
const std::string& get_human_name() const {return human_name;}
- // Ephemeral state for tracking progress of capability recalls
- time recalled_at = time::min(); // When was I asked to SESSION_RECALL?
- time released_at = time::min(); // When did the session last release caps?
- uint32_t recall_count = 0; // How many caps was I asked to SESSION_RECALL?
- uint32_t recall_release_count = 0; // How many caps have I actually revoked?
- uint32_t recall_limit = 0; // New limit in SESSION_RECALL
-
session_info_t info; ///< durable bits
MDSAuthCaps auth_caps;
interval_set<inodeno_t> pending_prealloc_inos; // journaling prealloc, will be added to prealloc_inos
void notify_cap_release(size_t n_caps);
- uint64_t notify_recall_sent(const size_t new_limit);
- auto cap_recalled_counter() const {
- return cap_recalled.get(ceph_clock_now());
+ uint64_t notify_recall_sent(size_t new_limit);
+ auto get_recall_caps_throttle() const {
+ return recall_caps_throttle.get(ceph_clock_now());
+ }
+ auto get_recall_caps() const {
+ return recall_caps.get(ceph_clock_now());
+ }
+ auto get_release_caps() const {
+ return release_caps.get(ceph_clock_now());
}
- void clear_recalled();
inodeno_t next_ino() const {
if (info.prealloc_inos.empty())
Session() = delete;
Session(ConnectionRef con) :
- cap_recalled(g_conf->get_val<double>("mds_recall_max_decay_rate")),
+ recall_caps(ceph_clock_now(), g_conf->get_val<double>("mds_recall_warning_decay_rate")),
+ release_caps(ceph_clock_now(), g_conf->get_val<double>("mds_recall_warning_decay_rate")),
+ recall_caps_throttle(ceph_clock_now(), g_conf->get_val<double>("mds_recall_max_decay_rate")),
birth_time(clock::now()),
auth_caps(g_ceph_context),
item_session_list(this),