]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: add check_ops_in_flight()
authorGreg Farnum <gregory.farnum@dreamhost.com>
Wed, 1 Feb 2012 21:25:37 +0000 (13:25 -0800)
committerGreg Farnum <gregory.farnum@dreamhost.com>
Wed, 1 Feb 2012 23:54:04 +0000 (15:54 -0800)
By default it warns on requests that are more than 30 seconds old,
using an exponential backoff of that interval.
Also add state name retrieval to OpRequest.

Signed-off-by: Greg Farnum <gregory.farnum@dreamhost.com>
src/common/config_opts.h
src/osd/OSD.cc
src/osd/OpRequest.h

index 4ea9207f1c1db6202795cfbc82794f3b5a4215f9..42fb805f1078d5b2cdb73ad8feedcad6cc9b78ce 100644 (file)
@@ -295,6 +295,7 @@ OPTION(osd_rollback_to_cluster_snap, OPT_STR, "")
 OPTION(osd_max_notify_timeout, OPT_U32, 30) // max notify timeout in seconds
 OPTION(osd_kill_backfill_at, OPT_INT, 0)
 OPTION(osd_min_pg_log_entries, OPT_U32, 1000) // number of entries to keep in the pg log when trimming it
+OPTION(osd_op_complaint_time, OPT_FLOAT, 30) // how many seconds old makes an op complaint-worthy
 OPTION(filestore, OPT_BOOL, false)
 OPTION(filestore_max_sync_interval, OPT_DOUBLE, 5)    // seconds
 OPTION(filestore_min_sync_interval, OPT_DOUBLE, .01)  // seconds
index ae39befafea4725dfce56593f6ff313807862fca..515703ab4c47624248919555fdfce170478041f2 100644 (file)
@@ -1819,6 +1819,36 @@ void OSD::tick()
     dispatch_running = false;
     dispatch_cond.Signal();
   }
+
+  check_ops_in_flight();
+}
+
+void OSD::check_ops_in_flight()
+{
+  ops_in_flight_lock.Lock();
+  if (ops_in_flight.size()) {
+    utime_t now = ceph_clock_now(g_ceph_context);
+    utime_t too_old = now;
+    too_old -= g_conf->osd_op_complaint_time;
+    dout(1) << "ops_in_flight.size: " << ops_in_flight.size()
+              << "; oldest is " << now - ops_in_flight.front()->received_time
+              << " seconds old" << dendl;
+    xlist<OpRequest*>::iterator i = ops_in_flight.begin();
+    while (!i.end() && (*i)->received_time < too_old) {
+      // exponential backoff of warning intervals
+      if ( ( (*i)->received_time +
+            (g_conf->osd_op_complaint_time *
+              (*i)->warn_interval_multiplier) )< now) {
+        stringstream ss;
+        ss << "old request " << *((*i)->request) << " received at "
+           << (*i)->received_time << " currently " << (*i)->state_string();
+        clog.warn(ss);
+        ++(*i)->warn_interval_multiplier;
+      }
+      ++i;
+    }
+  }
+  ops_in_flight_lock.Unlock();
 }
 
 void OSD::register_inflight_op(xlist<OpRequest*>::item *i)
index 9bcc9698d6d86993daa6ec506193f4efa21bdfd5..839400d8667930460fa0c773ee4ea464233ede29 100644 (file)
@@ -25,6 +25,7 @@ struct OpRequest : public RefCountedObject {
   Message *request;
   xlist<OpRequest*>::item xitem;
   utime_t received_time;
+  uint8_t warn_interval_multiplier;
 private:
   OSD *osd;
   uint8_t hit_flag_points;
@@ -37,7 +38,9 @@ private:
 
 public:
   OpRequest() : request(NULL), xitem(this) {}
-  OpRequest(Message *req, OSD *o) : request(req), xitem(this), osd(o) {
+  OpRequest(Message *req, OSD *o) : request(req), xitem(this),
+      warn_interval_multiplier(1),
+      osd(o) {
     received_time = request->get_recv_stamp();
   }
   ~OpRequest() {
@@ -58,6 +61,18 @@ public:
   bool currently_started() { return hit_flag_points & flag_started; }
   bool currently_sub_op_sent() { return hit_flag_points & flag_sub_op_sent; }
 
+  const char *state_string() {
+    switch(latest_flag_point) {
+    case flag_queued_for_pg: return "queued for pg";
+    case flag_reached_pg: return "reached pg";
+    case flag_delayed: return "delayed";
+    case flag_started: return "started";
+    case flag_sub_op_sent: return "waiting for sub ops";
+    default: break;
+    }
+    return "no flag points reached";
+  }
+
   void mark_queued_for_pg() {
     hit_flag_points |= flag_queued_for_pg;
     latest_flag_point = flag_queued_for_pg;