]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Add perf tracking for all states in RecoveryState
authorDavid Zafman <david.zafman@inktank.com>
Thu, 15 Aug 2013 19:28:06 +0000 (12:28 -0700)
committerDavid Zafman <david.zafman@inktank.com>
Fri, 16 Aug 2013 01:29:35 +0000 (18:29 -0700)
Fixes: #5848
Signed-off-by: David Zafman <david.zafman@inktank.com>
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc

index 14bdc87487ce7420c2e6b2c50c9a9ed00a026e49..55f11707189b13ff286dad09d077c447db59711f 100644 (file)
@@ -161,6 +161,7 @@ OSDService::OSDService(OSD *osd) :
   cluster_messenger(osd->cluster_messenger),
   client_messenger(osd->client_messenger),
   logger(osd->logger),
+  recoverystate_perf(osd->recoverystate_perf),
   monc(osd->monc),
   op_wq(osd->op_wq),
   peering_wq(osd->peering_wq),
@@ -898,6 +899,7 @@ OSD::OSD(int id, Messenger *internal_messenger, Messenger *external_messenger,
   client_messenger(external_messenger),
   monc(mc),
   logger(NULL),
+  recoverystate_perf(NULL),
   store(NULL),
   clog(external_messenger->cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
   whoami(id),
@@ -953,7 +955,9 @@ OSD::~OSD()
   delete authorize_handler_cluster_registry;
   delete authorize_handler_service_registry;
   delete class_handler;
+  g_ceph_context->get_perfcounters_collection()->remove(recoverystate_perf);
   g_ceph_context->get_perfcounters_collection()->remove(logger);
+  delete recoverystate_perf;
   delete logger;
   delete store;
 }
@@ -1183,6 +1187,8 @@ int OSD::init()
   osdmap = get_map(superblock.current_epoch);
   check_osdmap_features();
 
+  create_recoverystate_perf();
+
   bind_epoch = osdmap->get_epoch();
 
   // load up pgs (as they previously existed)
@@ -1416,12 +1422,51 @@ void OSD::create_logger()
   osd_plb.add_u64_counter(l_osd_mape_dup, "map_message_epoch_dups"); // dup osdmap epochs
   osd_plb.add_u64_counter(l_osd_waiting_for_map,
                          "messages_delayed_for_map"); // dup osdmap epochs
-  osd_plb.add_time_avg(l_osd_peering_latency, "peering_latency");
 
   logger = osd_plb.create_perf_counters();
   g_ceph_context->get_perfcounters_collection()->add(logger);
 }
 
+void OSD::create_recoverystate_perf()
+{
+  dout(10) << "create_recoverystate_perf" << dendl;
+
+  PerfCountersBuilder rs_perf(g_ceph_context, "recoverystate_perf", rs_first, rs_last);
+
+  rs_perf.add_time_avg(rs_initial_latency, "initial_latency");
+  rs_perf.add_time_avg(rs_started_latency, "started_latency");
+  rs_perf.add_time_avg(rs_reset_latency, "reset_latency");
+  rs_perf.add_time_avg(rs_start_latency, "start_latency");
+  rs_perf.add_time_avg(rs_primary_latency, "primary_latency");
+  rs_perf.add_time_avg(rs_peering_latency, "peering_latency");
+  rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency");
+  rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency");
+  rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency");
+  rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency");
+  rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency");
+  rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency");
+  rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency");
+  rs_perf.add_time_avg(rs_RepRecovering_latency, "RepRecovering_latency");
+  rs_perf.add_time_avg(rs_activating_latency, "activating_latency");
+  rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency");
+  rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency");
+  rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency");
+  rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency");
+  rs_perf.add_time_avg(rs_clean_latency, "clean_latency");
+  rs_perf.add_time_avg(rs_active_latency, "active_latency");
+  rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency");
+  rs_perf.add_time_avg(rs_stray_latency, "stray_latency");
+  rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency");
+  rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency");
+  rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency");
+  rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency");
+  rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency");
+  rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency");
+
+  recoverystate_perf = rs_perf.create_perf_counters();
+  g_ceph_context->get_perfcounters_collection()->add(recoverystate_perf);
+}
+
 void OSD::suicide(int exitcode)
 {
   if (g_conf->filestore_blackhole) {
index 82a251d9a80a6d88b80fd308f686dcc38b40908e..ec1ee30f853eed1d399837ecceeb1629f1c69b5c 100644 (file)
@@ -113,10 +113,45 @@ enum {
   l_osd_mape_dup,
 
   l_osd_waiting_for_map,
-  l_osd_peering_latency,
+
   l_osd_last,
 };
 
+// RecoveryState perf counters
+enum {
+  rs_first = 20000,
+  rs_initial_latency,
+  rs_started_latency,
+  rs_reset_latency,
+  rs_start_latency,
+  rs_primary_latency,
+  rs_peering_latency,
+  rs_backfilling_latency,
+  rs_waitremotebackfillreserved_latency,
+  rs_waitlocalbackfillreserved_latency,
+  rs_notbackfilling_latency,
+  rs_repnotrecovering_latency,
+  rs_repwaitrecoveryreserved_latency,
+  rs_repwaitbackfillreserved_latency,
+  rs_RepRecovering_latency,
+  rs_activating_latency,
+  rs_waitlocalrecoveryreserved_latency,
+  rs_waitremoterecoveryreserved_latency,
+  rs_recovering_latency,
+  rs_recovered_latency,
+  rs_clean_latency,
+  rs_active_latency,
+  rs_replicaactive_latency,
+  rs_stray_latency,
+  rs_getinfo_latency,
+  rs_getlog_latency,
+  rs_waitactingchange_latency,
+  rs_incomplete_latency,
+  rs_getmissing_latency,
+  rs_waitupthru_latency,
+  rs_last,
+};
+
 class Messenger;
 class Message;
 class MonClient;
@@ -258,6 +293,7 @@ private:
   Messenger *&client_messenger;
 public:
   PerfCounters *&logger;
+  PerfCounters *&recoverystate_perf;
   MonClient   *&monc;
   ThreadPool::WorkQueueVal<pair<PGRef, OpRequestRef>, PGRef> &op_wq;
   ThreadPool::BatchWorkQueue<PG> &peering_wq;
@@ -593,6 +629,7 @@ protected:
   Messenger   *client_messenger;
   MonClient   *monc;
   PerfCounters      *logger;
+  PerfCounters      *recoverystate_perf;
   ObjectStore *store;
 
   LogClient clog;
@@ -613,6 +650,7 @@ protected:
   int dispatch_running;
 
   void create_logger();
+  void create_recoverystate_perf();
   void tick();
   void _dispatch(Message *m);
   void dispatch_op(OpRequestRef op);
index 8e78eaa7a16d06927e9bf626a4ebcf71c7bc2be3..63be6a34b03af9de11115bd89a2fa5dfbcc63b09 100644 (file)
@@ -5164,6 +5164,9 @@ boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
 void PG::RecoveryState::Initial::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
 }
 
 /*------Started-------*/
@@ -5210,6 +5213,9 @@ boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
 void PG::RecoveryState::Started::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
 }
 
 /*--------Reset---------*/
@@ -5282,6 +5288,9 @@ boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
 void PG::RecoveryState::Reset::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
 }
 
 /*-------Start---------*/
@@ -5304,6 +5313,9 @@ PG::RecoveryState::Start::Start(my_context ctx)
 void PG::RecoveryState::Start::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
 }
 
 /*---------Primary--------*/
@@ -5351,6 +5363,8 @@ void PG::RecoveryState::Primary::exit()
   context< RecoveryMachine >().log_exit(state_name, enter_time);
   PG *pg = context< RecoveryMachine >().pg;
   pg->want_acting.clear();
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
 }
 
 /*---------Peering--------*/
@@ -5438,7 +5452,7 @@ void PG::RecoveryState::Peering::exit()
   pg->clear_probe_targets();
 
   utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
-  pg->osd->logger->tinc(l_osd_peering_latency, dur);
+  pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
 }
 
 
@@ -5475,6 +5489,8 @@ void PG::RecoveryState::Backfilling::exit()
   pg->backfill_reserved = false;
   pg->backfill_reserving = false;
   pg->state_clear(PG_STATE_BACKFILL);
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
 }
 
 /*--WaitRemoteBackfillReserved--*/
@@ -5507,6 +5523,9 @@ PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_con
 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
 }
 
 boost::statechart::result
@@ -5549,6 +5568,9 @@ PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_conte
 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
 }
 
 /*----NotBackfilling------*/
@@ -5562,6 +5584,9 @@ PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
 void PG::RecoveryState::NotBackfilling::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
 }
 
 /*---RepNotRecovering----*/
@@ -5575,6 +5600,9 @@ PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
 void PG::RecoveryState::RepNotRecovering::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
 }
 
 /*---RepWaitRecoveryReserved--*/
@@ -5609,6 +5637,9 @@ PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &
 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
 }
 
 /*-RepWaitBackfillReserved*/
@@ -5644,6 +5675,9 @@ PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
 void PG::RecoveryState::RepWaitBackfillReserved::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
 }
 
 boost::statechart::result
@@ -5689,6 +5723,8 @@ void PG::RecoveryState::RepRecovering::exit()
   context< RecoveryMachine >().log_exit(state_name, enter_time);
   PG *pg = context< RecoveryMachine >().pg;
   pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_RepRecovering_latency, dur);
 }
 
 /*------Activating--------*/
@@ -5702,6 +5738,9 @@ PG::RecoveryState::Activating::Activating(my_context ctx)
 void PG::RecoveryState::Activating::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
 }
 
 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
@@ -5721,6 +5760,9 @@ PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_conte
 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
 }
 
 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
@@ -5765,6 +5807,9 @@ PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserve
 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
 }
 
 PG::RecoveryState::Recovering::Recovering(my_context ctx)
@@ -5825,6 +5870,9 @@ PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
 void PG::RecoveryState::Recovering::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
 }
 
 PG::RecoveryState::Recovered::Recovered(my_context ctx)
@@ -5856,6 +5904,9 @@ PG::RecoveryState::Recovered::Recovered(my_context ctx)
 void PG::RecoveryState::Recovered::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
 }
 
 PG::RecoveryState::Clean::Clean(my_context ctx)
@@ -5882,6 +5933,8 @@ void PG::RecoveryState::Clean::exit()
   context< RecoveryMachine >().log_exit(state_name, enter_time);
   PG *pg = context< RecoveryMachine >().pg;
   pg->state_clear(PG_STATE_CLEAN);
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
 }
 
 /*---------Active---------*/
@@ -6131,6 +6184,8 @@ void PG::RecoveryState::Active::exit()
   pg->state_clear(PG_STATE_BACKFILL_WAIT);
   pg->state_clear(PG_STATE_RECOVERY_WAIT);
   pg->state_clear(PG_STATE_REPLAY);
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
 }
 
 /*------ReplicaActive-----*/
@@ -6222,6 +6277,8 @@ void PG::RecoveryState::ReplicaActive::exit()
   context< RecoveryMachine >().log_exit(state_name, enter_time);
   PG *pg = context< RecoveryMachine >().pg;
   pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
 }
 
 /*-------Stray---*/
@@ -6321,6 +6378,9 @@ boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
 void PG::RecoveryState::Stray::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
 }
 
 /*--------GetInfo---------*/
@@ -6493,6 +6553,9 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
 void PG::RecoveryState::GetInfo::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
 }
 
 /*------GetLog------------*/
@@ -6605,6 +6668,9 @@ boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
 void PG::RecoveryState::GetLog::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
 }
 
 /*------WaitActingChange--------*/
@@ -6662,6 +6728,9 @@ boost::statechart::result PG::RecoveryState::WaitActingChange::react(const Query
 void PG::RecoveryState::WaitActingChange::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
 }
 
 /*------Incomplete--------*/
@@ -6697,6 +6766,8 @@ void PG::RecoveryState::Incomplete::exit()
   PG *pg = context< RecoveryMachine >().pg;
 
   pg->state_clear(PG_STATE_INCOMPLETE);
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
 }
 
 /*------GetMissing--------*/
@@ -6819,6 +6890,9 @@ boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState&
 void PG::RecoveryState::GetMissing::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
 }
 
 /*---WaitFlushedPeering---*/
@@ -6897,6 +6971,9 @@ boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState&
 void PG::RecoveryState::WaitUpThru::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  utime_t dur = ceph_clock_now(g_ceph_context) - enter_time;
+  pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
 }
 
 /*----RecoveryState::RecoveryMachine Methods-----*/