From: Greg Farnum Date: Wed, 1 Feb 2012 21:25:37 +0000 (-0800) Subject: osd: add check_ops_in_flight() X-Git-Tag: v0.44~99^2~4 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=71c59dae27b0044a28feed171670cfefd2ccbcf1;p=ceph.git osd: add check_ops_in_flight() By default it warns on requests that are more than 30 seconds old, using an exponential backoff of that interval. Also add state name retrieval to OpRequest. Signed-off-by: Greg Farnum --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 4ea9207f1c1d..42fb805f1078 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -295,6 +295,7 @@ OPTION(osd_rollback_to_cluster_snap, OPT_STR, "") OPTION(osd_max_notify_timeout, OPT_U32, 30) // max notify timeout in seconds OPTION(osd_kill_backfill_at, OPT_INT, 0) OPTION(osd_min_pg_log_entries, OPT_U32, 1000) // number of entries to keep in the pg log when trimming it +OPTION(osd_op_complaint_time, OPT_FLOAT, 30) // how many seconds old makes an op complaint-worthy OPTION(filestore, OPT_BOOL, false) OPTION(filestore_max_sync_interval, OPT_DOUBLE, 5) // seconds OPTION(filestore_min_sync_interval, OPT_DOUBLE, .01) // seconds diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index ae39befafea4..515703ab4c47 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1819,6 +1819,36 @@ void OSD::tick() dispatch_running = false; dispatch_cond.Signal(); } + + check_ops_in_flight(); +} + +void OSD::check_ops_in_flight() +{ + ops_in_flight_lock.Lock(); + if (ops_in_flight.size()) { + utime_t now = ceph_clock_now(g_ceph_context); + utime_t too_old = now; + too_old -= g_conf->osd_op_complaint_time; + dout(1) << "ops_in_flight.size: " << ops_in_flight.size() + << "; oldest is " << now - ops_in_flight.front()->received_time + << " seconds old" << dendl; + xlist::iterator i = ops_in_flight.begin(); + while (!i.end() && (*i)->received_time < too_old) { + // exponential backoff of warning intervals + if ( ( (*i)->received_time + + (g_conf->osd_op_complaint_time * + (*i)->warn_interval_multiplier) )< now) { + stringstream ss; + ss << "old request " << *((*i)->request) << " received at " + << (*i)->received_time << " currently " << (*i)->state_string(); + clog.warn(ss); + ++(*i)->warn_interval_multiplier; + } + ++i; + } + } + ops_in_flight_lock.Unlock(); } void OSD::register_inflight_op(xlist::item *i) diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h index 9bcc9698d6d8..839400d86679 100644 --- a/src/osd/OpRequest.h +++ b/src/osd/OpRequest.h @@ -25,6 +25,7 @@ struct OpRequest : public RefCountedObject { Message *request; xlist::item xitem; utime_t received_time; + uint8_t warn_interval_multiplier; private: OSD *osd; uint8_t hit_flag_points; @@ -37,7 +38,9 @@ private: public: OpRequest() : request(NULL), xitem(this) {} - OpRequest(Message *req, OSD *o) : request(req), xitem(this), osd(o) { + OpRequest(Message *req, OSD *o) : request(req), xitem(this), + warn_interval_multiplier(1), + osd(o) { received_time = request->get_recv_stamp(); } ~OpRequest() { @@ -58,6 +61,18 @@ public: bool currently_started() { return hit_flag_points & flag_started; } bool currently_sub_op_sent() { return hit_flag_points & flag_sub_op_sent; } + const char *state_string() { + switch(latest_flag_point) { + case flag_queued_for_pg: return "queued for pg"; + case flag_reached_pg: return "reached pg"; + case flag_delayed: return "delayed"; + case flag_started: return "started"; + case flag_sub_op_sent: return "waiting for sub ops"; + default: break; + } + return "no flag points reached"; + } + void mark_queued_for_pg() { hit_flag_points |= flag_queued_for_pg; latest_flag_point = flag_queued_for_pg;