From: Ved-vampir Date: Thu, 5 Mar 2015 15:39:57 +0000 (+0300) Subject: os, osd: add perf counters description X-Git-Tag: v9.0.0~141^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=799e05d86a960480cff2ee71643bf1715674a5fd;p=ceph.git os, osd: add perf counters description Sign-off-by: Alyona Kiseleva --- diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index f6c3bb872a2..a723ce80a50 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -579,29 +579,29 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, osflagbit // initialize logger PerfCountersBuilder plb(g_ceph_context, internal_name, l_os_first, l_os_last); - plb.add_u64(l_os_jq_max_ops, "journal_queue_max_ops"); - plb.add_u64(l_os_jq_ops, "journal_queue_ops"); - plb.add_u64_counter(l_os_j_ops, "journal_ops"); - plb.add_u64(l_os_jq_max_bytes, "journal_queue_max_bytes"); - plb.add_u64(l_os_jq_bytes, "journal_queue_bytes"); - plb.add_u64_counter(l_os_j_bytes, "journal_bytes"); - plb.add_time_avg(l_os_j_lat, "journal_latency"); - plb.add_u64_counter(l_os_j_wr, "journal_wr"); - plb.add_u64_avg(l_os_j_wr_bytes, "journal_wr_bytes"); - plb.add_u64(l_os_oq_max_ops, "op_queue_max_ops"); - plb.add_u64(l_os_oq_ops, "op_queue_ops"); - plb.add_u64_counter(l_os_ops, "ops"); - plb.add_u64(l_os_oq_max_bytes, "op_queue_max_bytes"); - plb.add_u64(l_os_oq_bytes, "op_queue_bytes"); - plb.add_u64_counter(l_os_bytes, "bytes"); - plb.add_time_avg(l_os_apply_lat, "apply_latency"); - plb.add_u64(l_os_committing, "committing"); - - plb.add_u64_counter(l_os_commit, "commitcycle"); - plb.add_time_avg(l_os_commit_len, "commitcycle_interval"); - plb.add_time_avg(l_os_commit_lat, "commitcycle_latency"); - plb.add_u64_counter(l_os_j_full, "journal_full"); - plb.add_time_avg(l_os_queue_lat, "queue_transaction_latency_avg"); + plb.add_u64(l_os_jq_max_ops, "journal_queue_max_ops", "Max operations in journal queue"); + plb.add_u64(l_os_jq_ops, "journal_queue_ops", "Operations in journal queue"); + plb.add_u64_counter(l_os_j_ops, "journal_ops", "Total journal entries written"); + plb.add_u64(l_os_jq_max_bytes, "journal_queue_max_bytes", "Max data in journal queue"); + plb.add_u64(l_os_jq_bytes, "journal_queue_bytes", "Size of journal queue"); + plb.add_u64_counter(l_os_j_bytes, "journal_bytes", "Total operations size in journal"); + plb.add_time_avg(l_os_j_lat, "journal_latency", "Average journal queue completing latency"); + plb.add_u64_counter(l_os_j_wr, "journal_wr", "Journal write IOs"); + plb.add_u64_avg(l_os_j_wr_bytes, "journal_wr_bytes", "Journal data written"); + plb.add_u64(l_os_oq_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue"); + plb.add_u64(l_os_oq_ops, "op_queue_ops", "Operations in writing to FS queue"); + plb.add_u64_counter(l_os_ops, "ops", "Operations written to store"); + plb.add_u64(l_os_oq_max_bytes, "op_queue_max_bytes", "Max data in writing to FS queue"); + plb.add_u64(l_os_oq_bytes, "op_queue_bytes", "Size of writing to FS queue"); + plb.add_u64_counter(l_os_bytes, "bytes", "Data written to store"); + plb.add_time_avg(l_os_apply_lat, "apply_latency", "Apply latency"); + plb.add_u64(l_os_committing, "committing", "Is currently committing"); + + plb.add_u64_counter(l_os_commit, "commitcycle", "Commit cycles"); + plb.add_time_avg(l_os_commit_len, "commitcycle_interval", "Average interval between commits"); + plb.add_time_avg(l_os_commit_lat, "commitcycle_latency", "Average latency of commit"); + plb.add_u64_counter(l_os_j_full, "journal_full", "Journal writes while full"); + plb.add_time_avg(l_os_queue_lat, "queue_transaction_latency_avg", "Store operation queue latency"); logger = plb.create_perf_counters(); diff --git a/src/os/WBThrottle.cc b/src/os/WBThrottle.cc index 85bca32e568..03374568297 100644 --- a/src/os/WBThrottle.cc +++ b/src/os/WBThrottle.cc @@ -22,12 +22,12 @@ WBThrottle::WBThrottle(CephContext *cct) : PerfCountersBuilder b( cct, string("WBThrottle"), l_wbthrottle_first, l_wbthrottle_last); - b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied"); - b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb"); - b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied"); - b.add_u64(l_wbthrottle_ios_wb, "ios_wb"); - b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied"); - b.add_u64(l_wbthrottle_inodes_wb, "inodes_wb"); + b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied", "Dirty data"); + b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb", "Written data"); + b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied", "Dirty operations"); + b.add_u64(l_wbthrottle_ios_wb, "ios_wb", "Written operations"); + b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied", "Entries waiting for write"); + b.add_u64(l_wbthrottle_inodes_wb, "inodes_wb", "Written entries"); logger = b.create_perf_counters(); cct->get_perfcounters_collection()->add(logger); for (unsigned i = l_wbthrottle_first + 1; i != l_wbthrottle_last; ++i) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 35854f56d30..0fd3ec9eab4 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2106,18 +2106,18 @@ void OSD::create_logger() osd_plb.add_time_avg(l_osd_op_rw_process_lat, "op_rw_process_latency", "Latency of read-modify-write operation (excluding queue time)"); // client rmw process latency - osd_plb.add_u64_counter(l_osd_sop, "subop"); // subops - osd_plb.add_u64_counter(l_osd_sop_inb, "subop_in_bytes"); // subop in bytes - osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency"); // subop latency - - osd_plb.add_u64_counter(l_osd_sop_w, "subop_w"); // replicated (client) writes - osd_plb.add_u64_counter(l_osd_sop_w_inb, "subop_w_in_bytes"); // replicated write in bytes - osd_plb.add_time_avg(l_osd_sop_w_lat, "subop_w_latency"); // replicated write latency - osd_plb.add_u64_counter(l_osd_sop_pull, "subop_pull"); // pull request - osd_plb.add_time_avg(l_osd_sop_pull_lat, "subop_pull_latency"); - osd_plb.add_u64_counter(l_osd_sop_push, "subop_push"); // push (write) - osd_plb.add_u64_counter(l_osd_sop_push_inb, "subop_push_in_bytes"); - osd_plb.add_time_avg(l_osd_sop_push_lat, "subop_push_latency"); + osd_plb.add_u64_counter(l_osd_sop, "subop", "Suboperations"); // subops + osd_plb.add_u64_counter(l_osd_sop_inb, "subop_in_bytes", "Suboperations total size"); // subop in bytes + osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency"); // subop latency + + osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes"); // replicated (client) writes + osd_plb.add_u64_counter(l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size"); // replicated write in bytes + osd_plb.add_time_avg(l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency"); // replicated write latency + osd_plb.add_u64_counter(l_osd_sop_pull, "subop_pull", "Suboperations pull requests"); // pull request + osd_plb.add_time_avg(l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency"); + osd_plb.add_u64_counter(l_osd_sop_push, "subop_push", "Suboperations push messages"); // push (write) + osd_plb.add_u64_counter(l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size"); + osd_plb.add_time_avg(l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency"); osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent"); // pull requests sent osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent"); // push messages @@ -2138,37 +2138,36 @@ void OSD::create_logger() osd_plb.add_u64(l_osd_pg_stray, "numpg_stray", "Placement groups ready to be deleted from this osd"); // num stray pgs osd_plb.add_u64(l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to"); // heartbeat peers we send to osd_plb.add_u64(l_osd_hb_from, "heartbeat_from_peers", "Heartbeat (ping) peers we recv from"); // heartbeat peers we recv from - osd_plb.add_u64_counter(l_osd_map, "map_messages"); // osdmap messages - osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs"); // osdmap epochs - osd_plb.add_u64_counter(l_osd_mape_dup, "map_message_epoch_dups"); // dup osdmap epochs - osd_plb.add_u64_counter(l_osd_waiting_for_map, - "messages_delayed_for_map"); // dup osdmap epochs - - osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes"); - osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used"); - osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail"); - - osd_plb.add_u64_counter(l_osd_copyfrom, "copyfrom"); - - osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote"); - osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush"); - osd_plb.add_u64_counter(l_osd_tier_flush_fail, "tier_flush_fail"); - osd_plb.add_u64_counter(l_osd_tier_try_flush, "tier_try_flush"); - osd_plb.add_u64_counter(l_osd_tier_try_flush_fail, "tier_try_flush_fail"); - osd_plb.add_u64_counter(l_osd_tier_evict, "tier_evict"); - osd_plb.add_u64_counter(l_osd_tier_whiteout, "tier_whiteout"); - osd_plb.add_u64_counter(l_osd_tier_dirty, "tier_dirty"); - osd_plb.add_u64_counter(l_osd_tier_clean, "tier_clean"); - osd_plb.add_u64_counter(l_osd_tier_delay, "tier_delay"); - osd_plb.add_u64_counter(l_osd_tier_proxy_read, "tier_proxy_read"); - - osd_plb.add_u64_counter(l_osd_agent_wake, "agent_wake"); - osd_plb.add_u64_counter(l_osd_agent_skip, "agent_skip"); - osd_plb.add_u64_counter(l_osd_agent_flush, "agent_flush"); - osd_plb.add_u64_counter(l_osd_agent_evict, "agent_evict"); - - osd_plb.add_u64_counter(l_osd_object_ctx_cache_hit, "object_ctx_cache_hit"); - osd_plb.add_u64_counter(l_osd_object_ctx_cache_total, "object_ctx_cache_total"); + osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages"); // osdmap messages + osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs"); // osdmap epochs + osd_plb.add_u64_counter(l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates"); // dup osdmap epochs + osd_plb.add_u64_counter(l_osd_waiting_for_map, "messages_delayed_for_map", "Operations waiting for OSD map"); // dup osdmap epochs + + osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes", "OSD size"); + osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used", "Used space"); + osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space"); + + osd_plb.add_u64_counter(l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations"); + + osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions"); + osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes"); + osd_plb.add_u64_counter(l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes"); + osd_plb.add_u64_counter(l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts"); + osd_plb.add_u64_counter(l_osd_tier_try_flush_fail, "tier_try_flush_fail", "Failed tier flush attempts"); + osd_plb.add_u64_counter(l_osd_tier_evict, "tier_evict", "Tier evictions"); + osd_plb.add_u64_counter(l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts"); + osd_plb.add_u64_counter(l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set"); + osd_plb.add_u64_counter(l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned"); + osd_plb.add_u64_counter(l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)"); + osd_plb.add_u64_counter(l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads"); + + osd_plb.add_u64_counter(l_osd_agent_wake, "agent_wake", "Tiering agent wake up"); + osd_plb.add_u64_counter(l_osd_agent_skip, "agent_skip", "Objects skipped by agent"); + osd_plb.add_u64_counter(l_osd_agent_flush, "agent_flush", "Tiering agent flushes"); + osd_plb.add_u64_counter(l_osd_agent_evict, "agent_evict", "Tiering agent evictions"); + + osd_plb.add_u64_counter(l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits"); + osd_plb.add_u64_counter(l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups"); logger = osd_plb.create_perf_counters(); cct->get_perfcounters_collection()->add(logger); @@ -2180,35 +2179,35 @@ void OSD::create_recoverystate_perf() PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last); - rs_perf.add_time_avg(rs_initial_latency, "initial_latency"); - rs_perf.add_time_avg(rs_started_latency, "started_latency"); - rs_perf.add_time_avg(rs_reset_latency, "reset_latency"); - rs_perf.add_time_avg(rs_start_latency, "start_latency"); - rs_perf.add_time_avg(rs_primary_latency, "primary_latency"); - rs_perf.add_time_avg(rs_peering_latency, "peering_latency"); - rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency"); - rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency"); - rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency"); - rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency"); - rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency"); - rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency"); - rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency"); - rs_perf.add_time_avg(rs_RepRecovering_latency, "RepRecovering_latency"); - rs_perf.add_time_avg(rs_activating_latency, "activating_latency"); - rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency"); - rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency"); - rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency"); - rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency"); - rs_perf.add_time_avg(rs_clean_latency, "clean_latency"); - rs_perf.add_time_avg(rs_active_latency, "active_latency"); - rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency"); - rs_perf.add_time_avg(rs_stray_latency, "stray_latency"); - rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency"); - rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency"); - rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency"); - rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency"); - rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency"); - rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency"); + rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency"); + rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency"); + rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency"); + rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency"); + rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency"); + rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency"); + rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency"); + rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency"); + rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency"); + rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency"); + rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency"); + rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency"); + rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency"); + rs_perf.add_time_avg(rs_RepRecovering_latency, "RepRecovering_latency", "RepRecovering recovery state latency"); + rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency"); + rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency"); + rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency"); + rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency"); + rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency"); + rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency"); + rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency"); + rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency"); + rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency"); + rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency"); + rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency"); + rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency"); + rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency"); + rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency"); + rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency"); recoverystate_perf = rs_perf.create_perf_counters(); cct->get_perfcounters_collection()->add(recoverystate_perf);