OPTION(mon_pg_create_interval, 0, OPT_FLOAT, 30.0), // no more than every 30s
OPTION(mon_clientid_prealloc, 0, OPT_INT, 100), // how many clientids to prealloc
OPTION(mon_globalid_prealloc, 0, OPT_INT, 100), // how many globalids to prealloc
+ OPTION(mon_osd_report_timeout, 0, OPT_INT, 900), // grace period before declaring unresponsive OSDs dead
OPTION(paxos_propose_interval, 0, OPT_DOUBLE, 1.0), // gather updates for this long before proposing a map update
OPTION(paxos_min_wait, 0, OPT_DOUBLE, 0.05), // min time to gather updates for after period of inactivity
OPTION(paxos_observer_timeout, 0, OPT_DOUBLE, 5*60), // gather updates for this long before proposing a map update
float mon_pg_create_interval;
int mon_clientid_prealloc;
int mon_globalid_prealloc;
+ int mon_osd_report_timeout;
double paxos_propose_interval;
double paxos_min_wait;
}
+void OSDMonitor::handle_osd_timeouts(const utime_t &now,
+ const std::map<int,utime_t> &last_osd_report)
+{
+ utime_t timeo(g_conf.mon_osd_report_timeout, 0);
+ int max_osd = osdmap.get_max_osd();
+ bool new_down = false;
+
+ for (int i=0; i < max_osd; ++i) {
+ dout(30) << "handle_osd_timeouts: checking up on osd " << i << dendl;
+ if (!osdmap.exists(i))
+ continue;
+ if (!osdmap.is_up(i))
+ continue;
+ const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
+ if (t == last_osd_report.end()) {
+ derr << "OSDMonitor::handle_osd_timeouts: never got MOSDPGStat "
+ << "info from osd " << i << ". Marking down!" << dendl;
+ pending_inc.new_down[i] = true;
+ new_down = true;
+ }
+ else {
+ utime_t diff(now);
+ diff -= t->second;
+ if (diff > timeo) {
+ derr << "OSDMonitor::handle_osd_timeouts: last got MOSDPGStat "
+ << "info from osd " << i << " at " << t->second << ". It has "
+ << "been " << diff << ", so we're marking it down!" << dendl;
+ pending_inc.new_down[i] = true;
+ new_down = true;
+ }
+ }
+ }
+ if (new_down) {
+ propose_pending();
+ }
+}
void OSDMonitor::mark_all_down()
{
bool preprocess_command(MMonCommand *m);
bool prepare_command(MMonCommand *m);
+ void handle_osd_timeouts(const utime_t &now,
+ const std::map<int,utime_t> &last_osd_report);
void mark_all_down();
void send_latest(PaxosServiceMessage *m, epoch_t start=0);
<< ".pg v" << pg_map.version << " ";
}
-
/*
Tick function to update the map based on performance every N seconds
*/
+void PGMonitor::on_election_start()
+{
+ // clear leader state
+ last_sent_pg_create.clear();
+ last_osd_report.clear();
+}
+
void PGMonitor::tick()
{
if (!paxos->is_active()) return;
update_from_paxos();
+ handle_osd_timeouts();
dout(10) << pg_map << dendl;
}
}
// walk through incrementals
+ utime_t now(g_clock.now());
while (paxosv > pg_map.version) {
bufferlist bl;
bool success = paxos->read(pg_map.version+1, bl);
return true;
}
+void PGMonitor::handle_osd_timeouts()
+{
+ if (!mon->is_leader())
+ return;
+ utime_t now(g_clock.now());
+ utime_t timeo(g_conf.mon_osd_report_timeout, 0);
+ if (now - mon->get_leader_since() < timeo) {
+ // We haven't been the leader for long enough to consider OSD timeouts
+ return;
+ }
+
+ mon->osdmon()->handle_osd_timeouts(now, last_osd_report);
+}
+
void PGMonitor::create_pending()
{
pending_inc = PGMap::Incremental();
return false;
}
+bool PGMonitor::pg_stats_have_changed(int from, const MPGStats *stats) const
+{
+ // any new osd info?
+ hash_map<int,osd_stat_t>::const_iterator s = pg_map.osd_stat.find(from);
+ if (s == pg_map.osd_stat.end())
+ return true;
+ if (s->second != stats->osd_stat)
+ return true;
+
+ // any new pg info?
+ for (map<pg_t,pg_stat_t>::const_iterator p = stats->pg_stat.begin();
+ p != stats->pg_stat.end(); ++p) {
+ hash_map<pg_t,pg_stat_t>::const_iterator t = pg_map.pg_stat.find(p->first);
+ if (t == pg_map.pg_stat.end())
+ return true;
+ if (t->second.reported != p->second.reported)
+ return true;
+ }
+
+ return false;
+}
+
bool PGMonitor::prepare_pg_stats(MPGStats *stats)
{
dout(10) << "prepare_pg_stats " << *stats << " from " << stats->get_orig_source() << dendl;
stats->put();
return false;
}
+
+ last_osd_report[from] = g_clock.now();
+
if (!stats->get_orig_source().is_osd() ||
!mon->osdmon()->osdmap.is_up(from) ||
stats->get_orig_source_inst() != mon->osdmon()->osdmap.get_inst(from)) {
return false;
}
+ if (!pg_stats_have_changed(from, stats)) {
+ dout(10) << " message contains no new osd|pg stats" << dendl;
+ MPGStatsAck *ack = new MPGStatsAck;
+ for (map<pg_t,pg_stat_t>::const_iterator p = stats->pg_stat.begin();
+ p != stats->pg_stat.end();
+ ++p) {
+ ack->pg_stat[p->first] = p->second.reported;
+ }
+ mon->send_reply(stats, ack);
+ stats->put();
+ return false;
+ }
+
// osd stat
pending_inc.osd_stat_updates[from] = stats->osd_stat;
#include <set>
using namespace std;
+#include "PGMap.h"
+#include "PaxosService.h"
#include "include/types.h"
+#include "include/utime.h"
#include "msg/Messenger.h"
-#include "PaxosService.h"
-
-#include "PGMap.h"
class MPGStats;
class MPGStatsAck;
void create_initial(bufferlist& bl);
bool update_from_paxos();
+ void handle_osd_timeouts();
void create_pending(); // prepare a new pending
void encode_pending(bufferlist &bl); // propose pending update to peers
bool prepare_update(PaxosServiceMessage *m);
bool preprocess_pg_stats(MPGStats *stats);
+ bool pg_stats_have_changed(int from, const MPGStats *stats) const;
bool prepare_pg_stats(MPGStats *stats);
void _updated_stats(MPGStats *req, MPGStatsAck *ack);
map<int,utime_t> last_sent_pg_create; // per osd throttle
+ // when we last received PG stats from each osd
+ map<int,utime_t> last_osd_report;
+
void register_pg(pg_pool_t& pool, pg_t pgid, epoch_t epoch, bool new_pool);
bool register_new_pgs();
void send_pg_creates();
public:
PGMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { }
- void tick(); // check state, take actions
+ virtual void on_election_start();
+ void tick(); // check state, take actions
void check_osd_map(epoch_t epoch);
-
};
#endif