From 494b812ca138bb3a0673a9a0635bd124c24e85f3 Mon Sep 17 00:00:00 2001 From: Colin Patrick McCabe Date: Thu, 6 Jan 2011 13:33:07 -0800 Subject: [PATCH] mon: mark osds down for not sending MOSDPGStat PGMonitor::prepare_pg_stats should check to see if the stats in the MOSDPgStats message are the same as the ones we already have. If so, no need to create an incremental; just send an ACK and return false. The leading Monitor now marks osds as down if they haven't sent a MOSDPGStat message in the last 15 minutes. Signed-off-by: Colin McCabe --- src/config.cc | 1 + src/config.h | 1 + src/mon/OSDMonitor.cc | 36 +++++++++++++++++++++++++ src/mon/OSDMonitor.h | 2 ++ src/mon/PGMonitor.cc | 62 ++++++++++++++++++++++++++++++++++++++++++- src/mon/PGMonitor.h | 15 +++++++---- 6 files changed, 111 insertions(+), 6 deletions(-) diff --git a/src/config.cc b/src/config.cc index 6e85833f93056..791935a990ca0 100644 --- a/src/config.cc +++ b/src/config.cc @@ -403,6 +403,7 @@ static struct config_option config_optionsp[] = { OPTION(mon_pg_create_interval, 0, OPT_FLOAT, 30.0), // no more than every 30s OPTION(mon_clientid_prealloc, 0, OPT_INT, 100), // how many clientids to prealloc OPTION(mon_globalid_prealloc, 0, OPT_INT, 100), // how many globalids to prealloc + OPTION(mon_osd_report_timeout, 0, OPT_INT, 900), // grace period before declaring unresponsive OSDs dead OPTION(paxos_propose_interval, 0, OPT_DOUBLE, 1.0), // gather updates for this long before proposing a map update OPTION(paxos_min_wait, 0, OPT_DOUBLE, 0.05), // min time to gather updates for after period of inactivity OPTION(paxos_observer_timeout, 0, OPT_DOUBLE, 5*60), // gather updates for this long before proposing a map update diff --git a/src/config.h b/src/config.h index ced6c4db234f1..93b454f8d120b 100644 --- a/src/config.h +++ b/src/config.h @@ -191,6 +191,7 @@ struct md_config_t { float mon_pg_create_interval; int mon_clientid_prealloc; int mon_globalid_prealloc; + int mon_osd_report_timeout; double paxos_propose_interval; double paxos_min_wait; diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 3180697bc508b..e2ea93c2373a8 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -989,6 +989,42 @@ void OSDMonitor::tick() } +void OSDMonitor::handle_osd_timeouts(const utime_t &now, + const std::map &last_osd_report) +{ + utime_t timeo(g_conf.mon_osd_report_timeout, 0); + int max_osd = osdmap.get_max_osd(); + bool new_down = false; + + for (int i=0; i < max_osd; ++i) { + dout(30) << "handle_osd_timeouts: checking up on osd " << i << dendl; + if (!osdmap.exists(i)) + continue; + if (!osdmap.is_up(i)) + continue; + const std::map::const_iterator t = last_osd_report.find(i); + if (t == last_osd_report.end()) { + derr << "OSDMonitor::handle_osd_timeouts: never got MOSDPGStat " + << "info from osd " << i << ". Marking down!" << dendl; + pending_inc.new_down[i] = true; + new_down = true; + } + else { + utime_t diff(now); + diff -= t->second; + if (diff > timeo) { + derr << "OSDMonitor::handle_osd_timeouts: last got MOSDPGStat " + << "info from osd " << i << " at " << t->second << ". It has " + << "been " << diff << ", so we're marking it down!" << dendl; + pending_inc.new_down[i] = true; + new_down = true; + } + } + } + if (new_down) { + propose_pending(); + } +} void OSDMonitor::mark_all_down() { diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 1c150013ac20e..69ebbd01cc39f 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -162,6 +162,8 @@ private: bool preprocess_command(MMonCommand *m); bool prepare_command(MMonCommand *m); + void handle_osd_timeouts(const utime_t &now, + const std::map &last_osd_report); void mark_all_down(); void send_latest(PaxosServiceMessage *m, epoch_t start=0); diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index ad7afe35df10e..0709e3754ab93 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -47,16 +47,23 @@ static ostream& _prefix(Monitor *mon, PGMap& pg_map) { << ".pg v" << pg_map.version << " "; } - /* Tick function to update the map based on performance every N seconds */ +void PGMonitor::on_election_start() +{ + // clear leader state + last_sent_pg_create.clear(); + last_osd_report.clear(); +} + void PGMonitor::tick() { if (!paxos->is_active()) return; update_from_paxos(); + handle_osd_timeouts(); dout(10) << pg_map << dendl; } @@ -92,6 +99,7 @@ bool PGMonitor::update_from_paxos() } // walk through incrementals + utime_t now(g_clock.now()); while (paxosv > pg_map.version) { bufferlist bl; bool success = paxos->read(pg_map.version+1, bl); @@ -143,6 +151,20 @@ bool PGMonitor::update_from_paxos() return true; } +void PGMonitor::handle_osd_timeouts() +{ + if (!mon->is_leader()) + return; + utime_t now(g_clock.now()); + utime_t timeo(g_conf.mon_osd_report_timeout, 0); + if (now - mon->get_leader_since() < timeo) { + // We haven't been the leader for long enough to consider OSD timeouts + return; + } + + mon->osdmon()->handle_osd_timeouts(now, last_osd_report); +} + void PGMonitor::create_pending() { pending_inc = PGMap::Incremental(); @@ -306,6 +328,28 @@ bool PGMonitor::preprocess_pg_stats(MPGStats *stats) return false; } +bool PGMonitor::pg_stats_have_changed(int from, const MPGStats *stats) const +{ + // any new osd info? + hash_map::const_iterator s = pg_map.osd_stat.find(from); + if (s == pg_map.osd_stat.end()) + return true; + if (s->second != stats->osd_stat) + return true; + + // any new pg info? + for (map::const_iterator p = stats->pg_stat.begin(); + p != stats->pg_stat.end(); ++p) { + hash_map::const_iterator t = pg_map.pg_stat.find(p->first); + if (t == pg_map.pg_stat.end()) + return true; + if (t->second.reported != p->second.reported) + return true; + } + + return false; +} + bool PGMonitor::prepare_pg_stats(MPGStats *stats) { dout(10) << "prepare_pg_stats " << *stats << " from " << stats->get_orig_source() << dendl; @@ -316,6 +360,9 @@ bool PGMonitor::prepare_pg_stats(MPGStats *stats) stats->put(); return false; } + + last_osd_report[from] = g_clock.now(); + if (!stats->get_orig_source().is_osd() || !mon->osdmon()->osdmap.is_up(from) || stats->get_orig_source_inst() != mon->osdmon()->osdmap.get_inst(from)) { @@ -324,6 +371,19 @@ bool PGMonitor::prepare_pg_stats(MPGStats *stats) return false; } + if (!pg_stats_have_changed(from, stats)) { + dout(10) << " message contains no new osd|pg stats" << dendl; + MPGStatsAck *ack = new MPGStatsAck; + for (map::const_iterator p = stats->pg_stat.begin(); + p != stats->pg_stat.end(); + ++p) { + ack->pg_stat[p->first] = p->second.reported; + } + mon->send_reply(stats, ack); + stats->put(); + return false; + } + // osd stat pending_inc.osd_stat_updates[from] = stats->osd_stat; diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h index 797c0273729d9..2e03b43517cb4 100644 --- a/src/mon/PGMonitor.h +++ b/src/mon/PGMonitor.h @@ -24,11 +24,11 @@ #include using namespace std; +#include "PGMap.h" +#include "PaxosService.h" #include "include/types.h" +#include "include/utime.h" #include "msg/Messenger.h" -#include "PaxosService.h" - -#include "PGMap.h" class MPGStats; class MPGStatsAck; @@ -45,6 +45,7 @@ private: void create_initial(bufferlist& bl); bool update_from_paxos(); + void handle_osd_timeouts(); void create_pending(); // prepare a new pending void encode_pending(bufferlist &bl); // propose pending update to peers @@ -54,6 +55,7 @@ private: bool prepare_update(PaxosServiceMessage *m); bool preprocess_pg_stats(MPGStats *stats); + bool pg_stats_have_changed(int from, const MPGStats *stats) const; bool prepare_pg_stats(MPGStats *stats); void _updated_stats(MPGStats *req, MPGStatsAck *ack); @@ -76,6 +78,9 @@ private: map last_sent_pg_create; // per osd throttle + // when we last received PG stats from each osd + map last_osd_report; + void register_pg(pg_pool_t& pool, pg_t pgid, epoch_t epoch, bool new_pool); bool register_new_pgs(); void send_pg_creates(); @@ -83,11 +88,11 @@ private: public: PGMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } - void tick(); // check state, take actions + virtual void on_election_start(); + void tick(); // check state, take actions void check_osd_map(epoch_t epoch); - }; #endif -- 2.39.5