mon->monmap->get_inst(mon->get_leader()));
}
}
+
+ // OSD_NO_DOWN_OUT_INTERVAL
+ {
+ // Warn if 'mon_osd_down_out_interval' is set to zero.
+ // Having this option set to zero on the leader acts much like the
+ // 'noout' flag. It's hard to figure out what's going wrong with clusters
+ // without the 'noout' flag set but acting like that just the same, so
+ // we report a HEALTH_WARN in case this option is set to zero.
+ // This is an ugly hack to get the warning out, but until we find a way
+ // to spread global options throughout the mon cluster and have all mons
+ // using a base set of the same options, we need to work around this sort
+ // of things.
+ // There's also the obvious drawback that if this is set on a single
+ // monitor on a 3-monitor cluster, this warning will only be shown every
+ // third monitor connection.
+ if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
+ g_conf->mon_osd_down_out_interval == 0) {
+ ostringstream ss, ds;
+ ss << "mon%plurals% %names %hasorhave% mon_osd_down_out_interval set to 0";
+ auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str());
+ ds << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
+ d.detail.push_back(ds.str());
+ }
+ }
+
return changed;
}
}
// health
- _check_health(tmp, t);
-}
-
-void OSDMonitor::_check_health(
- const OSDMap& nextmap,
- MonitorDBStore::TransactionRef t)
-{
- dout(20) << __func__ << dendl;
health_check_map_t next;
- int num_osds = osdmap.get_num_osds();
-
- // OSD_DOWN
- // OSD_$subtree_DOWN
- // OSD_ORPHAN
- if (num_osds >= 0) {
- int num_in_osds = 0;
- int num_down_in_osds = 0;
- set<int> osds;
- set<int> down_in_osds;
- set<int> up_in_osds;
- set<int> subtree_up;
- unordered_map<int, set<int> > subtree_type_down;
- unordered_map<int, int> num_osds_subtree;
- int max_type = osdmap.crush->get_max_type_id();
-
- for (int i = 0; i < osdmap.get_max_osd(); i++) {
- if (!osdmap.exists(i)) {
- if (osdmap.crush->item_exists(i)) {
- osds.insert(i);
- }
- continue;
- }
- if (osdmap.is_out(i))
- continue;
- ++num_in_osds;
- if (down_in_osds.count(i) || up_in_osds.count(i))
- continue;
- if (!osdmap.is_up(i)) {
- down_in_osds.insert(i);
- int parent_id = 0;
- int current = i;
- for (int type = 0; type <= max_type; type++) {
- if (!osdmap.crush->get_type_name(type))
- continue;
- int r = osdmap.crush->get_immediate_parent_id(current, &parent_id);
- if (r == -ENOENT)
- break;
- // break early if this parent is already marked as up
- if (subtree_up.count(parent_id))
- break;
- type = osdmap.crush->get_bucket_type(parent_id);
- if (!osdmap.subtree_type_is_down(
- g_ceph_context, parent_id, type,
- &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
- break;
- current = parent_id;
- }
- }
- }
-
- // calculate the number of down osds in each down subtree and
- // store it in num_osds_subtree
- for (int type = 1; type <= max_type; type++) {
- if (!osdmap.crush->get_type_name(type))
- continue;
- for (auto j = subtree_type_down[type].begin();
- j != subtree_type_down[type].end();
- ++j) {
- if (type == 1) {
- list<int> children;
- int num = osdmap.crush->get_children(*j, &children);
- num_osds_subtree[*j] = num;
- } else {
- list<int> children;
- int num = 0;
- int num_children = osdmap.crush->get_children(*j, &children);
- if (num_children == 0)
- continue;
- for (auto l = children.begin(); l != children.end(); ++l) {
- if (num_osds_subtree[*l] > 0) {
- num = num + num_osds_subtree[*l];
- }
- }
- num_osds_subtree[*j] = num;
- }
- }
- }
- num_down_in_osds = down_in_osds.size();
- assert(num_down_in_osds <= num_in_osds);
- if (num_down_in_osds > 0) {
- // summary of down subtree types and osds
- for (int type = max_type; type > 0; type--) {
- if (!osdmap.crush->get_type_name(type))
- continue;
- if (subtree_type_down[type].size() > 0) {
- ostringstream ss;
- ss << subtree_type_down[type].size() << " "
- << osdmap.crush->get_type_name(type);
- if (subtree_type_down[type].size() > 1) {
- ss << "s";
- }
- int sum_down_osds = 0;
- for (auto j = subtree_type_down[type].begin();
- j != subtree_type_down[type].end();
- ++j) {
- sum_down_osds = sum_down_osds + num_osds_subtree[*j];
- }
- ss << " (" << sum_down_osds << " osds) down";
- string err = string("OSD_") +
- string(osdmap.crush->get_type_name(type)) + "_DOWN";
- boost::to_upper(err);
- auto& d = next.add(err, HEALTH_WARN, ss.str());
- for (auto j = subtree_type_down[type].rbegin();
- j != subtree_type_down[type].rend();
- ++j) {
- ostringstream ss;
- ss << osdmap.crush->get_type_name(type);
- ss << " ";
- ss << osdmap.crush->get_item_name(*j);
- // at the top level, do not print location
- if (type != max_type) {
- ss << " (";
- ss << osdmap.crush->get_full_location_ordered_string(*j);
- ss << ")";
- }
- int num = num_osds_subtree[*j];
- ss << " (" << num << " osds)";
- ss << " is down";
- d.detail.push_back(ss.str());
- }
- }
- }
- ostringstream ss;
- ss << down_in_osds.size() << " osds down";
- auto& d = next.add("OSD_DOWN", HEALTH_WARN, ss.str());
- for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
- ostringstream ss;
- ss << "osd." << *it << " (";
- ss << osdmap.crush->get_full_location_ordered_string(*it);
- ss << ") is down";
- d.detail.push_back(ss.str());
- }
- }
-
- if (!osds.empty()) {
- ostringstream ss;
- ss << osds.size() << " osds exist in the crush map but not in the osdmap";
- auto& d = next.add("OSD_ORPHAN", HEALTH_WARN, ss.str());
- for (auto osd : osds) {
- ostringstream ss;
- ss << "osd." << osd << " exists in crush map but not in osdmap";
- d.detail.push_back(ss.str());
- }
- }
- }
-
- // OSD_OUT_OF_ORDER_FULL
- {
- // An osd could configure failsafe ratio, to something different
- // but for now assume it is the same here.
- float fsr = g_conf->osd_failsafe_full_ratio;
- if (fsr > 1.0) fsr /= 100;
- float fr = osdmap.get_full_ratio();
- float br = osdmap.get_backfillfull_ratio();
- float nr = osdmap.get_nearfull_ratio();
-
- list<string> detail;
- // These checks correspond to how OSDService::check_full_status() in an OSD
- // handles the improper setting of these values.
- if (br < nr) {
- ostringstream ss;
- ss << "backfillfull_ratio (" << br
- << ") < nearfull_ratio (" << nr << "), increased";
- detail.push_back(ss.str());
- br = nr;
- }
- if (fr < br) {
- ostringstream ss;
- ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
- << "), increased";
- detail.push_back(ss.str());
- fr = br;
- }
- if (fsr < fr) {
- ostringstream ss;
- ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
- << "), increased";
- detail.push_back(ss.str());
- }
- if (!detail.empty()) {
- auto& d = next.add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
- "full ratio(s) out of order");
- d.detail.swap(detail);
- }
- }
-
- // OSD_FULL
- // OSD_NEARFULL
- // OSD_BACKFILLFULL
- // OSD_FAILSAFE_FULL
- {
- set<int> full, backfillfull, nearfull;
- osdmap.get_full_osd_counts(&full, &backfillfull, &nearfull);
- if (full.size()) {
- ostringstream ss;
- ss << full.size() << " full osd(s)";
- auto& d = next.add("OSD_FULL", HEALTH_ERR, ss.str());
- for (auto& i: full) {
- ostringstream ss;
- ss << "osd." << i << " is full";
- d.detail.push_back(ss.str());
- }
- }
- if (backfillfull.size()) {
- ostringstream ss;
- ss << backfillfull.size() << " backfillfull osd(s)";
- auto& d = next.add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str());
- for (auto& i: backfillfull) {
- ostringstream ss;
- ss << "osd." << i << " is backfill full";
- d.detail.push_back(ss.str());
- }
- }
- if (nearfull.size()) {
- ostringstream ss;
- ss << nearfull.size() << " nearfull osd(s)";
- auto& d = next.add("OSD_NEARFULL", HEALTH_WARN, ss.str());
- for (auto& i: nearfull) {
- ostringstream ss;
- ss << "osd." << i << " is near full";
- d.detail.push_back(ss.str());
- }
- }
- }
-
- // OSD_FLAGS
- {
- // warn about flags
- uint64_t warn_flags =
- CEPH_OSDMAP_FULL |
- CEPH_OSDMAP_PAUSERD |
- CEPH_OSDMAP_PAUSEWR |
- CEPH_OSDMAP_PAUSEREC |
- CEPH_OSDMAP_NOUP |
- CEPH_OSDMAP_NODOWN |
- CEPH_OSDMAP_NOIN |
- CEPH_OSDMAP_NOOUT |
- CEPH_OSDMAP_NOBACKFILL |
- CEPH_OSDMAP_NORECOVER |
- CEPH_OSDMAP_NOSCRUB |
- CEPH_OSDMAP_NODEEP_SCRUB |
- CEPH_OSDMAP_NOTIERAGENT |
- CEPH_OSDMAP_NOREBALANCE;
- if (osdmap.test_flag(warn_flags)) {
- ostringstream ss;
- ss << osdmap.get_flag_string(osdmap.get_flags() & warn_flags)
- << " flag(s) set";
- next.add("OSD_FLAGS", HEALTH_WARN, ss.str());
- }
- }
-
- // OLD_CRUSH_TUNABLES
- if (g_conf->mon_warn_on_legacy_crush_tunables) {
- string min = osdmap.crush->get_min_required_version();
- if (min < g_conf->mon_crush_min_required_version) {
- ostringstream ss;
- ss << "crush map has legacy tunables (require " << min
- << ", min is " << g_conf->mon_crush_min_required_version << ")";
- auto& d = next.add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str());
- d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
- }
- }
-
- // OLD_CRUSH_STRAW_CALC_VERSION
- if (g_conf->mon_warn_on_crush_straw_calc_version_zero) {
- if (osdmap.crush->get_straw_calc_version() == 0) {
- ostringstream ss;
- ss << "crush map has straw_calc_version=0";
- auto& d = next.add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str());
- d.detail.push_back(
- "see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
- }
- }
-
- // OSD_CACHE_NO_HIT_SET
- if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
- list<string> detail;
- for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
- p != osdmap.pools.end();
- ++p) {
- const pg_pool_t& info = p->second;
- if (info.cache_mode_requires_hit_set() &&
- info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
- ostringstream ss;
- ss << "pool '" << osdmap.get_pool_name(p->first)
- << "' with cache_mode " << info.get_cache_mode_name()
- << " needs hit_set_type to be set but it is not";
- detail.push_back(ss.str());
- }
- }
- if (!detail.empty()) {
- ostringstream ss;
- ss << detail.size() << " cache pools are missing hit_sets";
- auto& d = next.add("OSD_CACHE_NO_HIT_SET", HEALTH_WARN, ss.str());
- d.detail.swap(detail);
- }
- }
-
- // OSD_NO_SORTBITWISE
- if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE) &&
- (osdmap.get_up_osd_features() &
- CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
- ostringstream ss;
- ss << "no legacy OSD present but 'sortbitwise' flag is not set";
- next.add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str());
- }
-
- // OSD_NO_DOWN_OUT_INTERVAL
- {
- // Warn if 'mon_osd_down_out_interval' is set to zero.
- // Having this option set to zero on the leader acts much like the
- // 'noout' flag. It's hard to figure out what's going wrong with clusters
- // without the 'noout' flag set but acting like that just the same, so
- // we report a HEALTH_WARN in case this option is set to zero.
- // This is an ugly hack to get the warning out, but until we find a way
- // to spread global options throughout the mon cluster and have all mons
- // using a base set of the same options, we need to work around this sort
- // of things.
- // There's also the obvious drawback that if this is set on a single
- // monitor on a 3-monitor cluster, this warning will only be shown every
- // third monitor connection.
- if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
- g_conf->mon_osd_down_out_interval == 0) {
- ostringstream ss;
- ss << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
- auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str());
- d.detail.push_back("this has the same effect as the 'noout' flag");
- }
- }
-
- // OSD_UPGRADE_FINISHED
- // none of these (yet) since we don't run until luminous upgrade is done.
-
- // OSD_POOL_FULL
- for (auto it : osdmap.get_pools()) {
- list<string> detail;
- const pg_pool_t &pool = it.second;
- if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
- const string& pool_name = osdmap.get_pool_name(it.first);
- stringstream ss;
- ss << "pool '" << pool_name << "' is full";
- detail.push_back(ss.str());
- }
- if (!detail.empty()) {
- ostringstream ss;
- ss << detail.size() << " pool(s) full";
- auto& d = next.add("OSD_POOL_FULL", HEALTH_WARN, ss.str());
- d.detail.swap(detail);
- }
- }
-
+ tmp.check_health(&next);
encode_health(next, t);
}
FAST_READ_DEFAULT
};
- void _check_health(const OSDMap& next, MonitorDBStore::TransactionRef t);
-
// svc
public:
void create_initial() override;
*
*/
+#include <boost/algorithm/string.hpp>
+
#include "OSDMap.h"
#include <algorithm>
#include "common/config.h"
#include "include/str_map.h"
#include "common/code_environment.h"
+#include "mon/health_check.h"
#include "crush/CrushTreeDumper.h"
#include "common/Clock.h"
out << tbl << d.summary() << "\n";
}
}
+
+void OSDMap::check_health(health_check_map_t *checks) const
+{
+ int num_osds = get_num_osds();
+
+ // OSD_DOWN
+ // OSD_$subtree_DOWN
+ // OSD_ORPHAN
+ if (num_osds >= 0) {
+ int num_in_osds = 0;
+ int num_down_in_osds = 0;
+ set<int> osds;
+ set<int> down_in_osds;
+ set<int> up_in_osds;
+ set<int> subtree_up;
+ unordered_map<int, set<int> > subtree_type_down;
+ unordered_map<int, int> num_osds_subtree;
+ int max_type = crush->get_max_type_id();
+
+ for (int i = 0; i < get_max_osd(); i++) {
+ if (!exists(i)) {
+ if (crush->item_exists(i)) {
+ osds.insert(i);
+ }
+ continue;
+ }
+ if (is_out(i))
+ continue;
+ ++num_in_osds;
+ if (down_in_osds.count(i) || up_in_osds.count(i))
+ continue;
+ if (!is_up(i)) {
+ down_in_osds.insert(i);
+ int parent_id = 0;
+ int current = i;
+ for (int type = 0; type <= max_type; type++) {
+ if (!crush->get_type_name(type))
+ continue;
+ int r = crush->get_immediate_parent_id(current, &parent_id);
+ if (r == -ENOENT)
+ break;
+ // break early if this parent is already marked as up
+ if (subtree_up.count(parent_id))
+ break;
+ type = crush->get_bucket_type(parent_id);
+ if (!subtree_type_is_down(
+ g_ceph_context, parent_id, type,
+ &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
+ break;
+ current = parent_id;
+ }
+ }
+ }
+
+ // calculate the number of down osds in each down subtree and
+ // store it in num_osds_subtree
+ for (int type = 1; type <= max_type; type++) {
+ if (!crush->get_type_name(type))
+ continue;
+ for (auto j = subtree_type_down[type].begin();
+ j != subtree_type_down[type].end();
+ ++j) {
+ if (type == 1) {
+ list<int> children;
+ int num = crush->get_children(*j, &children);
+ num_osds_subtree[*j] = num;
+ } else {
+ list<int> children;
+ int num = 0;
+ int num_children = crush->get_children(*j, &children);
+ if (num_children == 0)
+ continue;
+ for (auto l = children.begin(); l != children.end(); ++l) {
+ if (num_osds_subtree[*l] > 0) {
+ num = num + num_osds_subtree[*l];
+ }
+ }
+ num_osds_subtree[*j] = num;
+ }
+ }
+ }
+ num_down_in_osds = down_in_osds.size();
+ assert(num_down_in_osds <= num_in_osds);
+ if (num_down_in_osds > 0) {
+ // summary of down subtree types and osds
+ for (int type = max_type; type > 0; type--) {
+ if (!crush->get_type_name(type))
+ continue;
+ if (subtree_type_down[type].size() > 0) {
+ ostringstream ss;
+ ss << subtree_type_down[type].size() << " "
+ << crush->get_type_name(type);
+ if (subtree_type_down[type].size() > 1) {
+ ss << "s";
+ }
+ int sum_down_osds = 0;
+ for (auto j = subtree_type_down[type].begin();
+ j != subtree_type_down[type].end();
+ ++j) {
+ sum_down_osds = sum_down_osds + num_osds_subtree[*j];
+ }
+ ss << " (" << sum_down_osds << " osds) down";
+ string err = string("OSD_") +
+ string(crush->get_type_name(type)) + "_DOWN";
+ boost::to_upper(err);
+ auto& d = checks->add(err, HEALTH_WARN, ss.str());
+ for (auto j = subtree_type_down[type].rbegin();
+ j != subtree_type_down[type].rend();
+ ++j) {
+ ostringstream ss;
+ ss << crush->get_type_name(type);
+ ss << " ";
+ ss << crush->get_item_name(*j);
+ // at the top level, do not print location
+ if (type != max_type) {
+ ss << " (";
+ ss << crush->get_full_location_ordered_string(*j);
+ ss << ")";
+ }
+ int num = num_osds_subtree[*j];
+ ss << " (" << num << " osds)";
+ ss << " is down";
+ d.detail.push_back(ss.str());
+ }
+ }
+ }
+ ostringstream ss;
+ ss << down_in_osds.size() << " osds down";
+ auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str());
+ for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
+ ostringstream ss;
+ ss << "osd." << *it << " (";
+ ss << crush->get_full_location_ordered_string(*it);
+ ss << ") is down";
+ d.detail.push_back(ss.str());
+ }
+ }
+
+ if (!osds.empty()) {
+ ostringstream ss;
+ ss << osds.size() << " osds exist in the crush map but not in the osdmap";
+ auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str());
+ for (auto osd : osds) {
+ ostringstream ss;
+ ss << "osd." << osd << " exists in crush map but not in osdmap";
+ d.detail.push_back(ss.str());
+ }
+ }
+ }
+
+ // OSD_OUT_OF_ORDER_FULL
+ {
+ // An osd could configure failsafe ratio, to something different
+ // but for now assume it is the same here.
+ float fsr = g_conf->osd_failsafe_full_ratio;
+ if (fsr > 1.0) fsr /= 100;
+ float fr = get_full_ratio();
+ float br = get_backfillfull_ratio();
+ float nr = get_nearfull_ratio();
+
+ list<string> detail;
+ // These checks correspond to how OSDService::check_full_status() in an OSD
+ // handles the improper setting of these values.
+ if (br < nr) {
+ ostringstream ss;
+ ss << "backfillfull_ratio (" << br
+ << ") < nearfull_ratio (" << nr << "), increased";
+ detail.push_back(ss.str());
+ br = nr;
+ }
+ if (fr < br) {
+ ostringstream ss;
+ ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
+ << "), increased";
+ detail.push_back(ss.str());
+ fr = br;
+ }
+ if (fsr < fr) {
+ ostringstream ss;
+ ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
+ << "), increased";
+ detail.push_back(ss.str());
+ }
+ if (!detail.empty()) {
+ auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
+ "full ratio(s) out of order");
+ d.detail.swap(detail);
+ }
+ }
+
+ // OSD_FULL
+ // OSD_NEARFULL
+ // OSD_BACKFILLFULL
+ // OSD_FAILSAFE_FULL
+ {
+ set<int> full, backfillfull, nearfull;
+ get_full_osd_counts(&full, &backfillfull, &nearfull);
+ if (full.size()) {
+ ostringstream ss;
+ ss << full.size() << " full osd(s)";
+ auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str());
+ for (auto& i: full) {
+ ostringstream ss;
+ ss << "osd." << i << " is full";
+ d.detail.push_back(ss.str());
+ }
+ }
+ if (backfillfull.size()) {
+ ostringstream ss;
+ ss << backfillfull.size() << " backfillfull osd(s)";
+ auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str());
+ for (auto& i: backfillfull) {
+ ostringstream ss;
+ ss << "osd." << i << " is backfill full";
+ d.detail.push_back(ss.str());
+ }
+ }
+ if (nearfull.size()) {
+ ostringstream ss;
+ ss << nearfull.size() << " nearfull osd(s)";
+ auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str());
+ for (auto& i: nearfull) {
+ ostringstream ss;
+ ss << "osd." << i << " is near full";
+ d.detail.push_back(ss.str());
+ }
+ }
+ }
+
+ // OSD_FLAGS
+ {
+ // warn about flags
+ uint64_t warn_flags =
+ CEPH_OSDMAP_FULL |
+ CEPH_OSDMAP_PAUSERD |
+ CEPH_OSDMAP_PAUSEWR |
+ CEPH_OSDMAP_PAUSEREC |
+ CEPH_OSDMAP_NOUP |
+ CEPH_OSDMAP_NODOWN |
+ CEPH_OSDMAP_NOIN |
+ CEPH_OSDMAP_NOOUT |
+ CEPH_OSDMAP_NOBACKFILL |
+ CEPH_OSDMAP_NORECOVER |
+ CEPH_OSDMAP_NOSCRUB |
+ CEPH_OSDMAP_NODEEP_SCRUB |
+ CEPH_OSDMAP_NOTIERAGENT |
+ CEPH_OSDMAP_NOREBALANCE;
+ if (test_flag(warn_flags)) {
+ ostringstream ss;
+ ss << get_flag_string(get_flags() & warn_flags)
+ << " flag(s) set";
+ checks->add("OSD_FLAGS", HEALTH_WARN, ss.str());
+ }
+ }
+
+ // OLD_CRUSH_TUNABLES
+ if (g_conf->mon_warn_on_legacy_crush_tunables) {
+ string min = crush->get_min_required_version();
+ if (min < g_conf->mon_crush_min_required_version) {
+ ostringstream ss;
+ ss << "crush map has legacy tunables (require " << min
+ << ", min is " << g_conf->mon_crush_min_required_version << ")";
+ auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str());
+ d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
+ }
+ }
+
+ // OLD_CRUSH_STRAW_CALC_VERSION
+ if (g_conf->mon_warn_on_crush_straw_calc_version_zero) {
+ if (crush->get_straw_calc_version() == 0) {
+ ostringstream ss;
+ ss << "crush map has straw_calc_version=0";
+ auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str());
+ d.detail.push_back(
+ "see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
+ }
+ }
+
+ // OSD_CACHE_NO_HIT_SET
+ if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
+ list<string> detail;
+ for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
+ p != pools.end();
+ ++p) {
+ const pg_pool_t& info = p->second;
+ if (info.cache_mode_requires_hit_set() &&
+ info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
+ ostringstream ss;
+ ss << "pool '" << get_pool_name(p->first)
+ << "' with cache_mode " << info.get_cache_mode_name()
+ << " needs hit_set_type to be set but it is not";
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << detail.size() << " cache pools are missing hit_sets";
+ auto& d = checks->add("OSD_CACHE_NO_HIT_SET", HEALTH_WARN, ss.str());
+ d.detail.swap(detail);
+ }
+ }
+
+ // OSD_NO_SORTBITWISE
+ if (!test_flag(CEPH_OSDMAP_SORTBITWISE) &&
+ (get_up_osd_features() &
+ CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) {
+ ostringstream ss;
+ ss << "no legacy OSD present but 'sortbitwise' flag is not set";
+ checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str());
+ }
+
+ // OSD_UPGRADE_FINISHED
+ // none of these (yet) since we don't run until luminous upgrade is done.
+
+ // POOL_FULL
+ {
+ list<string> detail;
+ for (auto it : get_pools()) {
+ const pg_pool_t &pool = it.second;
+ if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
+ const string& pool_name = get_pool_name(it.first);
+ stringstream ss;
+ ss << "pool '" << pool_name << "' is full";
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << detail.size() << " pool(s) full";
+ auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str());
+ d.detail.swap(detail);
+ }
+ }
+}
// forward declaration
class CephContext;
class CrushWrapper;
+class health_check_map_t;
// FIXME C++11 does not have std::equal for two differently-typed containers.
// use this until we move to c++14
void dump(Formatter *f) const;
static void generate_test_instances(list<OSDMap*>& o);
bool check_new_blacklist_entries() const { return new_blacklist_entries; }
+
+ void check_health(health_check_map_t *checks) const;
};
WRITE_CLASS_ENCODER_FEATURES(OSDMap)
WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental)