OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock drift check) interval (seconds)
OPTION(mon_pg_create_interval, OPT_FLOAT, 30.0) // no more than every 30s
OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info)
+OPTION(mon_pg_min_inactive, OPT_U64, 1) // minimum number of PGs that must be stuck (inactive or stale) for longer than 'mon_pg_stuck_threshold' before health goes into ERR. 0 means disabled: never go into ERR.
OPTION(mon_pg_warn_min_per_osd, OPT_INT, 30) // min # pgs per (in) osd before we warn the admin
OPTION(mon_pg_warn_max_per_osd, OPT_INT, 300) // max # pgs per (in) osd before we warn the admin
OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT, 10.0) // max skew from average in objects per pg
ceph::unordered_map<pg_t, pg_stat_t> stuck_pgs;
utime_t now(ceph_clock_now(g_ceph_context));
utime_t cutoff = now - utime_t(g_conf->mon_pg_stuck_threshold, 0);
+ uint64_t num_inactive_pgs = 0;
pg_map.get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs);
if (!stuck_pgs.empty()) {
note["stuck inactive"] = stuck_pgs.size();
+ num_inactive_pgs += stuck_pgs.size();
if (detail)
note_stuck_detail(PGMap::STUCK_INACTIVE, stuck_pgs, detail);
}
pg_map.get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs);
if (!stuck_pgs.empty()) {
note["stuck stale"] = stuck_pgs.size();
+ num_inactive_pgs += stuck_pgs.size();
if (detail)
note_stuck_detail(PGMap::STUCK_STALE, stuck_pgs, detail);
}
+ if (g_conf->mon_pg_min_inactive > 0 && num_inactive_pgs >= g_conf->mon_pg_min_inactive) {
+ ostringstream ss;
+ ss << num_inactive_pgs << " pgs are stuck inactive for more than " << g_conf->mon_pg_stuck_threshold << " seconds";
+ summary.push_back(make_pair(HEALTH_ERR, ss.str()));
+ }
+
if (!note.empty()) {
for (map<string,int>::iterator p = note.begin(); p != note.end(); ++p) {
ostringstream ss;