osd: add a monitor timeout via MPGStatsAck messages

author Greg Farnum <gregory.farnum@dreamhost.com>

Sat, 24 Dec 2011 00:41:38 +0000 (16:41 -0800)

committer Sage Weil <sage.weil@dreamhost.com>

Tue, 3 Jan 2012 19:37:31 +0000 (11:37 -0800)
author Greg Farnum <gregory.farnum@dreamhost.com>
Sat, 24 Dec 2011 00:41:38 +0000 (16:41 -0800)
committer Sage Weil <sage.weil@dreamhost.com>
Tue, 3 Jan 2012 19:37:31 +0000 (11:37 -0800)
diff --git a/src/common/config_opts.h b/src/common/config_opts.h

index 8a62da8ae148d6ce05a77b23a165d6b7b5593aa6..6070ea210275a6c1b59eea162ff9b28b419bca87 100644 (file)
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -267,6 +267,7 @@ OPTION(osd_mon_heartbeat_interval, OPT_INT, 30)  // if no peers, ping monitor
  OPTION(osd_heartbeat_grace, OPT_INT, 20)
  OPTION(osd_mon_report_interval_max, OPT_INT, 120)
  OPTION(osd_mon_report_interval_min, OPT_INT, 5)  // pg stats, failures, up_thru, boot.
+OPTION(osd_mon_ack_timeout, OPT_INT, 30) // time out a mon if it doesn't ack stats
  OPTION(osd_min_down_reporters, OPT_INT, 1)   // number of OSDs who need to report a down OSD for it to count
  OPTION(osd_min_down_reports, OPT_INT, 3)     // number of times a down OSD must be reported for it to count
  OPTION(osd_default_data_pool_replay_window, OPT_INT, 45)
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc

index f80eefc67ae40c89b1de9cb1a065eb1ef670693e..5ec83e4478c10f0434a993dec56f2b9ed7f8a43e 100644 (file)
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -545,6 +545,7 @@ OSD::OSD(int id, Messenger *internal_messenger, Messenger *external_messenger,
    map_lock("OSD::map_lock"),
    peer_map_epoch_lock("OSD::peer_map_epoch_lock"),
    map_cache_lock("OSD::map_cache_lock"),
+  outstanding_pg_stats(false),
    up_thru_wanted(0), up_thru_pending(0),
    pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
    osd_stat_updated(false),
@@ -1802,6 +1803,13 @@ void OSD::tick()
  
    timer.add_event_after(1.0, new C_Tick(this));
  
+  if (outstanding_pg_stats
+      &&(now - g_conf->osd_mon_ack_timeout) > last_pg_stats_ack) {
+    dout(1) << "mon hasn't acked PGStats in " << now - last_pg_stats_ack
+            << "seconds, reconnecting elsewhere" << dendl;
+    monc->reopen_session();
+  }
+
    // only do waiters if dispatch() isn't currently running.  (if it is,
    // it'll do the waiters, and doing them here may screw up ordering
    // of op_queue vs handle_osd_map.)
@@ -2152,7 +2160,11 @@ void OSD::send_pg_stats(const utime_t &now)
        }
        pg->pg_stats_lock.Unlock();
      }
-    
+
+    if (!outstanding_pg_stats) {
+      outstanding_pg_stats = true;
+      last_pg_stats_ack = ceph_clock_now(g_ceph_context);
+    }
      monc->send_mon_message(m);
    }
  
@@ -2168,6 +2180,8 @@ void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
      return;
    }
  
+  last_pg_stats_ack = ceph_clock_now(g_ceph_context);
+
    pg_stat_queue_lock.Lock();
  
    if (ack->get_tid() > pg_stat_tid_flushed) {
@@ -2196,6 +2210,10 @@ void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
        dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_stable.reported << dendl;
    }
    
+  if (!pg_stat_queue.size()) {
+    outstanding_pg_stats = false;
+  }
+
    pg_stat_queue_lock.Unlock();
  
    ack->put();
diff --git a/src/osd/OSD.h b/src/osd/OSD.h

index 72296e10b6cc8439ed6497308f90a0b5d65b67ad..ad6e8a335c40d79d1b5b59fa634eaa10b60aecda 100644 (file)
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -479,6 +479,15 @@ protected:
    utime_t last_mon_report;
    utime_t last_pg_stats_sent;
  
+  /* If our monitor dies, we want to notice it and reconnect.
+   *  So we keep track of when it last acked our stat updates,
+   *  and if too much time passes while some updates are still
+   *  unacked (see outstanding_pg_stats), we call it dead and
+   *  reconnect elsewhere.
+   */
+  utime_t last_pg_stats_ack;
+  bool outstanding_pg_stats; // some stat updates haven't been acked yet
+
    void do_mon_report();
  
    // -- boot --
author	Greg Farnum <gregory.farnum@dreamhost.com>
	Sat, 24 Dec 2011 00:41:38 +0000 (16:41 -0800)
committer	Sage Weil <sage.weil@dreamhost.com>
	Tue, 3 Jan 2012 19:37:31 +0000 (11:37 -0800)
src/common/config_opts.h		patch \| blob \| history
src/osd/OSD.cc		patch \| blob \| history
src/osd/OSD.h		patch \| blob \| history