From: Josh Durgin Date: Wed, 15 Feb 2012 01:52:36 +0000 (-0800) Subject: mon: add dump_stuck command X-Git-Tag: v0.43~43^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=c08615e6e0f30b8f3a43e6c28b7b04f72dfb5023;p=ceph.git mon: add dump_stuck command This will help monitoring transient pg states at a coarse level. Fixes: #2005 Signed-off-by: Josh Durgin --- diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 5baa793f948..50a6d0072f7 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -489,6 +489,60 @@ void PGMap::dump(ostream& ss) const << std::endl; } +void PGMap::get_stuck_stats(PGMap::StuckPG type, utime_t cutoff, + hash_map& stuck_pgs) const +{ + for (hash_map::const_iterator i = pg_stat.begin(); + i != pg_stat.end(); + ++i) { + utime_t val; + switch (type) { + case STUCK_INACTIVE: + if (i->second.state & PG_STATE_ACTIVE) + continue; + val = i->second.last_active; + break; + case STUCK_UNCLEAN: + if (i->second.state & PG_STATE_CLEAN) + continue; + val = i->second.last_clean; + break; + case STUCK_STALE: + val = i->second.last_fresh; + break; + default: + assert(0 == "invalid type"); + } + + if (val < cutoff) { + stuck_pgs[i->first] = i->second; + } + } +} + +void PGMap::dump_stuck(Formatter *f, PGMap::StuckPG type, utime_t cutoff) const +{ + hash_map stuck_pg_stats; + get_stuck_stats(type, cutoff, stuck_pg_stats); + f->open_array_section("stuck_pg_stats"); + for (hash_map::const_iterator i = stuck_pg_stats.begin(); + i != stuck_pg_stats.end(); + ++i) { + f->open_object_section("pg_stat"); + f->dump_stream("pgid") << i->first; + i->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void PGMap::dump_stuck_plain(ostream& ss, PGMap::StuckPG type, utime_t cutoff) const +{ + hash_map stuck_pg_stats; + get_stuck_stats(type, cutoff, stuck_pg_stats); + dump_pg_stats_plain(ss, stuck_pg_stats); +} + void PGMap::state_summary(ostream& ss) const { for (hash_map::const_iterator p = num_pg_by_state.begin(); diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index 1fe03485307..90df77db8a8 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -71,6 +71,13 @@ public: osd_stat_t osd_sum; set creating_pgs; // lru: front = new additions, back = recently pinged + + enum StuckPG { + STUCK_INACTIVE, + STUCK_UNCLEAN, + STUCK_STALE, + STUCK_NONE + }; PGMap() : version(0), @@ -99,6 +106,11 @@ public: void dump_pg_stats_plain(ostream& ss, const hash_map& pg_stats) const; + void get_stuck_stats(StuckPG type, utime_t cutoff, + hash_map& stuck_pgs) const; + void dump_stuck(Formatter *f, StuckPG type, utime_t cutoff) const; + void dump_stuck_plain(ostream& ss, StuckPG type, utime_t cutoff) const; + void dump(ostream& ss) const; void state_summary(ostream& ss) const; diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 6016e9e8a4d..ce64eb09989 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -981,6 +981,9 @@ bool PGMonitor::preprocess_command(MMonCommand *m) jsf.flush(ds); rdata.append(ds); } + else if (m->cmd[1] == "dump_stuck") { + r = dump_stuck_pg_stats(ss, rdata, args); + } else if (m->cmd[1] == "dump_pools_json") { ss << "ok"; r = 0; @@ -1182,3 +1185,84 @@ enum health_status_t PGMonitor::get_health(std::ostream &ss) const return ret; } + +int PGMonitor::dump_stuck_pg_stats(ostream& ss, + bufferlist& rdata, + vector& args) const +{ + string format = "plain"; + string val; + int threshold = 300; + int seconds; + ostringstream err; + + if (args.size() < 2) { + ss << "Must specify inactive or unclean or stale."; + return -EINVAL; + } + + PGMap::StuckPG stuck_type = PGMap::STUCK_NONE; + string type = args[1]; + if (type == "inactive") + stuck_type = PGMap::STUCK_INACTIVE; + if (type == "unclean") + stuck_type = PGMap::STUCK_UNCLEAN; + if (type == "stale") + stuck_type = PGMap::STUCK_STALE; + if (stuck_type == PGMap::STUCK_NONE) { + ss << "Invalid stuck type '" << type + << "'. Valid types are: inactive, unclean, or stale"; + return -EINVAL; + } + + for (std::vector::iterator i = args.begin() + 2; + i != args.end(); ) { + if (ceph_argparse_double_dash(args, i)) { + break; + } else if (ceph_argparse_witharg(args, i, &val, + "-f", "--format", (char*)NULL)) { + if (val != "json" && val != "plain") { + ss << "format must be json or plain"; + return -EINVAL; + } + format = val; + } else if (ceph_argparse_withint(args, i, &seconds, &err, + "-t", "--threshold", (char*)NULL)) { + if (!err.str().empty()) { + ss << err.str(); + return -EINVAL; + } + threshold = seconds; + } else if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) { + stringstream ds; + ds << "Usage: ceph pg dump_stuck inactive|unclean|stale [options]" << std::endl + << std::endl + << "Get stats for pgs that have not been active, clean, or refreshed in some number of seconds." << std::endl + << std::endl + << "Options: " << std::endl + << " -h, --help display usage info" << std::endl + << " -f, --format [plain|json] output format (default: plain)" << std::endl + << " -t, --threshold [seconds] how many seconds 'stuck' is (default: 300)" << std::endl; + rdata.append(ds); + return 0; + } else { + ss << "invalid argument '" << *i << "'"; + return -EINVAL; + } + } + + utime_t now(ceph_clock_now(g_ceph_context)); + utime_t cutoff = now - utime_t(threshold, 0); + + stringstream ds; + if (format == "json") { + JSONFormatter jsf(true); + pg_map.dump_stuck(&jsf, stuck_type, cutoff); + jsf.flush(ds); + } else { + pg_map.dump_stuck_plain(ds, stuck_type, cutoff); + } + rdata.append(ds); + ss << "ok"; + return 0; +} diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h index 42ba9c4dd88..3dbbfc64c12 100644 --- a/src/mon/PGMonitor.h +++ b/src/mon/PGMonitor.h @@ -112,6 +112,15 @@ private: */ bool check_down_pgs(); + /** + * Dump stats from pgs stuck in specified states. + * + * @return 0 on success, negative error code on failure + */ + int dump_stuck_pg_stats(ostream& ss, + bufferlist& rdata, + vector& args) const; + public: PGMonitor(Monitor *mn, Paxos *p); virtual ~PGMonitor();