From 9e21ef0e40ff89b10dfdf857c703336496b3171f Mon Sep 17 00:00:00 2001 From: Brad Hubbard Date: Tue, 11 Dec 2018 14:00:35 +1000 Subject: [PATCH] osd: Implement lazy omap usage statistics Opportunistic gathering of omap statistics during deep scrub. Signed-off-by: Brad Hubbard --- doc/dev/placement-group.rst | 20 ++++++++++++++++++++ src/mon/PGMap.cc | 35 +++++++++++++++++++++++++++++++++++ src/osd/PG.cc | 13 ++++++++++--- src/osd/PG.h | 5 +++-- src/osd/PGBackend.cc | 20 +++++++++++--------- src/osd/PGBackend.h | 4 ++-- src/osd/ReplicatedBackend.cc | 9 +++++++++ src/osd/osd_types.cc | 34 ++++++++++++++++++++++++++++------ src/osd/osd_types.h | 18 ++++++++++++++++++ 9 files changed, 136 insertions(+), 22 deletions(-) diff --git a/doc/dev/placement-group.rst b/doc/dev/placement-group.rst index 1be1e7e47bb..ac58c984db0 100644 --- a/doc/dev/placement-group.rst +++ b/doc/dev/placement-group.rst @@ -152,3 +152,23 @@ User-visible PG States happens when pg_num_pending < pg_num, and applies to the PGs with pg_num_pending <= ps < pg_num as well as the corresponding peer PG that it is merging with. + + +OMAP STATISTICS +=============== + +Omap statistics are gathered during deep scrub and displayed in the output of +the following commands:: + + ceph pg dump + ceph pg dump all + ceph pg dump summary + ceph pg dump pgs + ceph pg dump pools + ceph pg ls + +As these statistics are not updated continuously, they may be quite inaccurate in +an environment where deep scrubs are run infrequently and/or there is a lot of +omap activity. As such, they should not be relied on for exact accuracy but +rather used as a guide. Running a deep scrub and checking these statistics +immediately afterwards should give a good indication of current omap usage. 
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 1ea4b2120be..382a049d578 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -1575,6 +1575,8 @@ void PGMap::dump_pg_stats_plain( tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT); tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT); tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT); tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT); tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT); tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT); @@ -1614,6 +1616,8 @@ void PGMap::dump_pg_stats_plain( << st.stats.sum.num_objects_misplaced << st.stats.sum.num_objects_unfound << st.stats.sum.num_bytes + << st.stats.sum.num_omap_bytes + << st.stats.sum.num_omap_keys << st.log_size << st.ondisk_log_size << pg_state_string(st.state) @@ -1670,6 +1674,8 @@ void PGMap::dump_pool_stats(ostream& ss, bool header) const tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT); tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT); tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT); tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT); tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT); } else { @@ -1682,6 +1688,8 @@ void PGMap::dump_pool_stats(ostream& ss, bool header) const tab.define_column("", TextTable::LEFT, TextTable::RIGHT); tab.define_column("", TextTable::LEFT, TextTable::RIGHT); tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); } for (auto p = pg_pool_sum.begin(); @@ -1694,6 +1702,8 @@ void 
PGMap::dump_pool_stats(ostream& ss, bool header) const << p->second.stats.sum.num_objects_misplaced << p->second.stats.sum.num_objects_unfound << p->second.stats.sum.num_bytes + << p->second.stats.sum.num_omap_bytes + << p->second.stats.sum.num_omap_keys << p->second.log_size << p->second.ondisk_log_size << TextTable::endrow; @@ -1714,6 +1724,8 @@ void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT); tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT); tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT); tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT); tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT); } else { @@ -1726,6 +1738,8 @@ void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const tab.define_column("", TextTable::LEFT, TextTable::RIGHT); tab.define_column("", TextTable::LEFT, TextTable::RIGHT); tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); }; tab << "sum" @@ -1735,6 +1749,8 @@ void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const << pg_sum.stats.sum.num_objects_misplaced << pg_sum.stats.sum.num_objects_unfound << pg_sum.stats.sum.num_bytes + << pg_sum.stats.sum.num_omap_bytes + << pg_sum.stats.sum.num_omap_keys << pg_sum.log_size << pg_sum.ondisk_log_size << TextTable::endrow; @@ -2187,6 +2203,8 @@ void PGMap::dump_filtered_pg_stats(ostream& ss, set& pgs) const tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT); tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT); tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_KEYS*", 
TextTable::LEFT, TextTable::RIGHT); tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT); tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT); tab.define_column("SINCE", TextTable::LEFT, TextTable::RIGHT); @@ -2212,6 +2230,8 @@ void PGMap::dump_filtered_pg_stats(ostream& ss, set& pgs) const << st.stats.sum.num_objects_misplaced << st.stats.sum.num_objects_unfound << st.stats.sum.num_bytes + << st.stats.sum.num_omap_bytes + << st.stats.sum.num_omap_keys << st.log_size << pg_state_string(st.state) << utimespan_str(now - st.last_change) @@ -3168,6 +3188,13 @@ int process_pg_map_command( string prefix = orig_prefix; auto cmdmap = orig_cmdmap; + string omap_stats_note = + "\n* NOTE: Omap statistics are gathered during deep scrub and " + "may be inaccurate soon afterwards depending on utilisation. See " + "http://docs.ceph.com/docs/master/dev/placement-group/#omap-statistics " + "for further details.\n"; + bool omap_stats_note_required = false; + // perhaps these would be better in the parsing, but it's weird bool primary = false; if (prefix == "pg dump_json") { @@ -3261,10 +3288,12 @@ int process_pg_map_command( } else { if (what.count("all")) { pg_map.dump(ds); + omap_stats_note_required = true; } else if (what.count("summary") || what.count("sum")) { pg_map.dump_basic(ds); pg_map.dump_pg_sum_stats(ds, true); pg_map.dump_osd_sum_stats(ds); + omap_stats_note_required = true; } else { if (what.count("pgs_brief")) { pg_map.dump_pg_stats(ds, true); @@ -3273,15 +3302,20 @@ int process_pg_map_command( if (what.count("pgs")) { pg_map.dump_pg_stats(ds, false); header = false; + omap_stats_note_required = true; } if (what.count("pools")) { pg_map.dump_pool_stats(ds, header); + omap_stats_note_required = true; } if (what.count("osds")) { pg_map.dump_osd_stats(ds); } } odata->append(ds); + if (omap_stats_note_required) { + odata->append(omap_stats_note); + } } *ss << "dumped " << what; return 0; @@ -3335,6 +3369,7 @@ int process_pg_map_command( } else if 
(!pgs.empty()) { pg_map.dump_filtered_pg_stats(ds, pgs); odata->append(ds); + odata->append(omap_stats_note); } return 0; } diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 0ea49cd8147..dec077459fe 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -5514,8 +5514,9 @@ void PG::scrub_compare_maps() } stringstream ss; - get_pgbackend()->be_large_omap_check(maps, master_set, - scrubber.large_omap_objects, ss); + get_pgbackend()->be_omap_checks(maps, master_set, + scrubber.omap_stats, ss); + if (!ss.str().empty()) { osd->clog->warn(ss); } @@ -5714,7 +5715,13 @@ void PG::scrub_finish() info.history.last_clean_scrub_stamp = now; info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors; info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors; - info.stats.stats.sum.num_large_omap_objects = scrubber.large_omap_objects; + info.stats.stats.sum.num_large_omap_objects = scrubber.omap_stats.large_omap_objects; + info.stats.stats.sum.num_omap_bytes = scrubber.omap_stats.omap_bytes; + info.stats.stats.sum.num_omap_keys = scrubber.omap_stats.omap_keys; + dout(25) << __func__ << " shard " << pg_whoami << " num_omap_bytes = " + << info.stats.stats.sum.num_omap_bytes << " num_omap_keys = " + << info.stats.stats.sum.num_omap_keys << dendl; + publish_stats_to_osd(); } else { info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors; // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent diff --git a/src/osd/PG.h b/src/osd/PG.h index d5dcf43ef85..e8a3a3c5eab 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1672,7 +1672,6 @@ public: set waiting_on_whom; int shallow_errors; int deep_errors; - int large_omap_objects = 0; int fixed; ScrubMap primary_scrubmap; ScrubMapBuilder primary_scrubmap_pos; @@ -1683,6 +1682,8 @@ public: OpRequestRef active_rep_scrub; utime_t scrub_reg_stamp; // stamp we registered for + omap_stat_t omap_stats = (const struct omap_stat_t){ 0 }; + // For async sleep bool sleeping = false; bool needs_sleep = true; @@ 
-1816,8 +1817,8 @@ public: subset_last_update = eversion_t(); shallow_errors = 0; deep_errors = 0; - large_omap_objects = 0; fixed = 0; + omap_stats = (const struct omap_stat_t){ 0 }; deep = false; run_callbacks(); inconsistent.clear(); diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc index 5d59f58e15b..62cfed4ed5d 100644 --- a/src/osd/PGBackend.cc +++ b/src/osd/PGBackend.cc @@ -1227,32 +1227,34 @@ out: } } -void PGBackend::be_large_omap_check(const map &maps, +void PGBackend::be_omap_checks(const map &maps, const set &master_set, - int& large_omap_objects, + omap_stat_t& omap_stats, ostream &warnstream) const { - bool needs_check = false; + bool needs_omap_check = false; for (const auto& map : maps) { - if (map.second->has_large_omap_object_errors) { - needs_check = true; + if (map.second->has_large_omap_object_errors || map.second->has_omap_keys) { + needs_omap_check = true; break; } } - if (!needs_check) { - return; + if (!needs_omap_check) { + return; // Nothing to do } - // Iterate through objects and check large omap object flag + // Iterate through objects and update omap stats for (const auto& k : master_set) { for (const auto& map : maps) { auto it = map.second->objects.find(k); if (it == map.second->objects.end()) continue; ScrubMap::object& obj = it->second; + omap_stats.omap_bytes += obj.object_omap_bytes; + omap_stats.omap_keys += obj.object_omap_keys; if (obj.large_omap_object_found) { - large_omap_objects++; + omap_stats.large_omap_objects++; warnstream << "Large omap object found. 
Object: " << k << " Key count: " << obj.large_omap_object_key_count << " Size (bytes): " << obj.large_omap_object_value_size << '\n'; diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index a0ae51d4f9e..6fdf6dd05b5 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -607,10 +607,10 @@ typedef std::shared_ptr OSDMapRef; ScrubMap &map, ScrubMapBuilder &pos, ScrubMap::object &o) = 0; - void be_large_omap_check( + void be_omap_checks( const map &maps, const set &master_set, - int& large_omap_objects, + omap_stat_t& omap_stats, ostream &warnstream) const; static PGBackend *build_pg_backend( diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index 7a585b19bad..d471e6b7a09 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -719,6 +719,15 @@ int ReplicatedBackend::be_deep_scrub( dout(20) << __func__ << " done with " << poid << " omap_digest " << std::hex << o.omap_digest << std::dec << dendl; + // Sum up omap usage + if (pos.omap_keys > 0 || pos.omap_bytes > 0) { + dout(25) << __func__ << " adding " << pos.omap_keys << " keys and " + << pos.omap_bytes << " bytes to pg_stats sums" << dendl; + map.has_omap_keys = true; + o.object_omap_bytes = pos.omap_bytes; + o.object_omap_keys = pos.omap_keys; + } + // done! 
return 0; } diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 2a531b2ab3d..2216cb2f5a9 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -2220,11 +2220,13 @@ void object_stat_sum_t::dump(Formatter *f) const f->dump_int("num_legacy_snapsets", num_legacy_snapsets); f->dump_int("num_large_omap_objects", num_large_omap_objects); f->dump_int("num_objects_manifest", num_objects_manifest); + f->dump_int("num_omap_bytes", num_omap_bytes); + f->dump_int("num_omap_keys", num_omap_keys); } void object_stat_sum_t::encode(bufferlist& bl) const { - ENCODE_START(18, 14, bl); + ENCODE_START(19, 14, bl); #if defined(CEPH_LITTLE_ENDIAN) bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t)); #else @@ -2265,6 +2267,8 @@ void object_stat_sum_t::encode(bufferlist& bl) const encode(num_legacy_snapsets, bl); encode(num_large_omap_objects, bl); encode(num_objects_manifest, bl); + encode(num_omap_bytes, bl); + encode(num_omap_keys, bl); #endif ENCODE_FINISH(bl); } @@ -2272,9 +2276,9 @@ void object_stat_sum_t::encode(bufferlist& bl) const void object_stat_sum_t::decode(bufferlist::const_iterator& bl) { bool decode_finish = false; - DECODE_START(18, bl); // make sure to also update fast decode below + DECODE_START(19, bl); // make sure to also update fast decode below #if defined(CEPH_LITTLE_ENDIAN) - if (struct_v >= 18) { // this must match newest decode version + if (struct_v >= 19) { // this must match newest decode version bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes)); decode_finish = true; } @@ -2325,6 +2329,10 @@ void object_stat_sum_t::decode(bufferlist::const_iterator& bl) if (struct_v >= 18) { decode(num_objects_manifest, bl); } + if (struct_v >= 19) { + decode(num_omap_bytes, bl); + decode(num_omap_keys, bl); + } } DECODE_FINISH(bl); } @@ -2366,6 +2374,8 @@ void object_stat_sum_t::generate_test_instances(list& o) a.num_objects_pinned = 20; a.num_large_omap_objects = 5; a.num_objects_manifest = 2; + a.num_omap_bytes = 20000; + a.num_omap_keys 
= 200; o.push_back(new object_stat_sum_t(a)); } @@ -2408,6 +2418,8 @@ void object_stat_sum_t::add(const object_stat_sum_t& o) num_legacy_snapsets += o.num_legacy_snapsets; num_large_omap_objects += o.num_large_omap_objects; num_objects_manifest += o.num_objects_manifest; + num_omap_bytes += o.num_omap_bytes; + num_omap_keys += o.num_omap_keys; } void object_stat_sum_t::sub(const object_stat_sum_t& o) @@ -2449,6 +2461,8 @@ void object_stat_sum_t::sub(const object_stat_sum_t& o) num_legacy_snapsets -= o.num_legacy_snapsets; num_large_omap_objects -= o.num_large_omap_objects; num_objects_manifest -= o.num_objects_manifest; + num_omap_bytes -= o.num_omap_bytes; + num_omap_keys -= o.num_omap_keys; } bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r) @@ -2490,7 +2504,9 @@ bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r) l.num_objects_pinned == r.num_objects_pinned && l.num_legacy_snapsets == r.num_legacy_snapsets && l.num_large_omap_objects == r.num_large_omap_objects && - l.num_objects_manifest == r.num_objects_manifest; + l.num_objects_manifest == r.num_objects_manifest && + l.num_omap_bytes == r.num_omap_bytes && + l.num_omap_keys == r.num_omap_keys; } // -- object_stat_collection_t -- @@ -6102,7 +6118,7 @@ void ScrubMap::generate_test_instances(list& o) void ScrubMap::object::encode(bufferlist& bl) const { bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch; - ENCODE_START(9, 7, bl); + ENCODE_START(10, 7, bl); encode(size, bl); encode(negative, bl); encode(attrs, bl); @@ -6120,12 +6136,14 @@ void ScrubMap::object::encode(bufferlist& bl) const encode(large_omap_object_found, bl); encode(large_omap_object_key_count, bl); encode(large_omap_object_value_size, bl); + encode(object_omap_bytes, bl); + encode(object_omap_keys, bl); ENCODE_FINISH(bl); } void ScrubMap::object::decode(bufferlist::const_iterator& bl) { - DECODE_START(9, bl); + DECODE_START(10, bl); decode(size, bl); bool tmp, compat_read_error = 
false; decode(tmp, bl); @@ -6163,6 +6181,10 @@ void ScrubMap::object::decode(bufferlist::const_iterator& bl) decode(large_omap_object_key_count, bl); decode(large_omap_object_value_size, bl); } + if (struct_v >= 10) { + decode(object_omap_bytes, bl); + decode(object_omap_keys, bl); + } DECODE_FINISH(bl); } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 4d6267e6129..bfde9710e0b 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1734,6 +1734,8 @@ struct object_stat_sum_t { int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets int64_t num_large_omap_objects = 0; int64_t num_objects_manifest = 0; + int64_t num_omap_bytes = 0; + int64_t num_omap_keys = 0; object_stat_sum_t() : num_bytes(0), @@ -1782,6 +1784,8 @@ struct object_stat_sum_t { FLOOR(num_wr_kb); FLOOR(num_large_omap_objects); FLOOR(num_objects_manifest); + FLOOR(num_omap_bytes); + FLOOR(num_omap_keys); FLOOR(num_shallow_scrub_errors); FLOOR(num_deep_scrub_errors); num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors; @@ -1838,6 +1842,8 @@ struct object_stat_sum_t { SPLIT(num_wr_kb); SPLIT(num_large_omap_objects); SPLIT(num_objects_manifest); + SPLIT(num_omap_bytes); + SPLIT(num_omap_keys); SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors); SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors); for (unsigned i = 0; i < out.size(); ++i) { @@ -1900,6 +1906,8 @@ struct object_stat_sum_t { sizeof(num_scrub_errors) + sizeof(num_large_omap_objects) + sizeof(num_objects_manifest) + + sizeof(num_omap_bytes) + + sizeof(num_omap_keys) + sizeof(num_objects_recovered) + sizeof(num_bytes_recovered) + sizeof(num_keys_recovered) + @@ -5331,6 +5339,8 @@ struct ScrubMap { bool large_omap_object_found:1; uint64_t large_omap_object_key_count = 0; uint64_t large_omap_object_value_size = 0; + uint64_t object_omap_bytes = 0; + uint64_t object_omap_keys = 0; object() : // Init invalid size so it won't match if we get a stat EIO error @@ -5350,6 +5360,7 @@ struct ScrubMap { 
eversion_t valid_through; eversion_t incr_since; bool has_large_omap_object_errors:1; + bool has_omap_keys:1; void merge_incr(const ScrubMap &l); void clear_from(const hobject_t& start) { @@ -5802,4 +5813,11 @@ struct pool_pg_num_history_t { }; WRITE_CLASS_ENCODER(pool_pg_num_history_t) +// omap specific stats +struct omap_stat_t { + int large_omap_objects; + int64_t omap_bytes; + int64_t omap_keys; +}; + #endif -- 2.47.3