From: Varada Kari Date: Thu, 12 Jan 2017 08:21:02 +0000 (+0530) Subject: os/bluestore: interface to dump key value distribution in kvdb X-Git-Tag: v12.0.0~97^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=64af217cd42f1a007581af0983a2e7e5ba8a52a1;p=ceph.git os/bluestore: interface to dump key value distribution in kvdb Adding an asok command to dump the key value distribution in a histogram fashion. Signed-off-by: Varada Kari --- diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc index 5b97598ebe12..20fb3e3f37c0 100644 --- a/src/kv/RocksDBStore.cc +++ b/src/kv/RocksDBStore.cc @@ -370,7 +370,7 @@ void RocksDBStore::close() cct->get_perfcounters_collection()->remove(logger); } -void RocksDBStore::split(const std::string &s, char delim, std::vector &elems) { +void RocksDBStore::split_stats(const std::string &s, char delim, std::vector &elems) { std::stringstream ss; ss.str(s); std::string item; @@ -379,12 +379,6 @@ void RocksDBStore::split(const std::string &s, char delim, std::vector RocksDBStore::split(const std::string &s, char delim) { - std::vector elems; - split(s, delim, elems); - return elems; -} - void RocksDBStore::get_statistics(Formatter *f) { if (!g_conf->rocksdb_perf) { @@ -400,7 +394,7 @@ void RocksDBStore::get_statistics(Formatter *f) f->open_object_section("rocksdb_statistics"); f->dump_string("rocksdb_compaction_statistics", ""); vector stats; - split(stat_str, '\n', stats); + split_stats(stat_str, '\n', stats); for (auto st :stats) { f->dump_string("", st); } @@ -412,7 +406,7 @@ void RocksDBStore::get_statistics(Formatter *f) f->open_object_section("rocksdb_extended_statistics"); string stat_str = dbstats->ToString(); vector stats; - split(stat_str, '\n', stats); + split_stats(stat_str, '\n', stats); f->dump_string("rocksdb_extended_statistics", ""); for (auto st :stats) { f->dump_string(".", st); diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h index 44dce9c92f32..fa3dc8dc1fb4 100644 --- 
a/src/kv/RocksDBStore.h +++ b/src/kv/RocksDBStore.h @@ -150,8 +150,7 @@ public: void close(); - void split(const std::string &s, char delim, std::vector &elems); - std::vector split(const std::string &s, char delim); + void split_stats(const std::string &s, char delim, std::vector &elems); void get_statistics(Formatter *f); struct RocksWBHandler: public rocksdb::WriteBatch::Handler { @@ -313,8 +312,8 @@ public: bufferlist value(); bufferptr value_as_ptr(); int status(); - size_t key_size(); - size_t value_size(); + size_t key_size() override; + size_t value_size() override; }; /// Utility diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index befbf102f389..6b12bbcebbee 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -1501,6 +1501,7 @@ public: } virtual void get_db_statistics(Formatter *f) { } + virtual void generate_db_histogram(Formatter *f) { } virtual string get_type() = 0; diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 9eb88409e460..832e23a65ebe 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -9118,7 +9118,171 @@ int BlueStore::_split_collection(TransContext *txc, return r; } +// DB key value Histogram +#define KEY_SLAB 32 +#define VALUE_SLAB 64 +const string prefix_onode = "o"; +const string prefix_onode_shard = "x"; +const string prefix_other = "Z"; +int BlueStore::DBHistogram::get_key_slab(size_t sz) +{ + return (sz/KEY_SLAB); +} + +string BlueStore::DBHistogram::get_key_slab_to_range(int slab) +{ + int lower_bound = slab * KEY_SLAB; + int upper_bound = (slab + 1) * KEY_SLAB; + string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")"; + return ret; +} + +int BlueStore::DBHistogram::get_value_slab(size_t sz) +{ + return (sz/VALUE_SLAB); +} + +string BlueStore::DBHistogram::get_value_slab_to_range(int slab) +{ + int lower_bound = slab * VALUE_SLAB; + int upper_bound = (slab + 1) * VALUE_SLAB; + string ret = "[" + stringify(lower_bound) + "," + 
stringify(upper_bound) + ")"; + return ret; +} + +void BlueStore::DBHistogram::update_hist_entry(map > &key_hist, + const string &prefix, size_t key_size, size_t value_size) +{ + uint32_t key_slab = get_key_slab(key_size); + uint32_t value_slab = get_value_slab(value_size); + key_hist[prefix][key_slab].count++; + key_hist[prefix][key_slab].max_len = MAX(key_size, key_hist[prefix][key_slab].max_len); + key_hist[prefix][key_slab].val_map[value_slab].count++; + key_hist[prefix][key_slab].val_map[value_slab].max_len = + MAX(value_size, key_hist[prefix][key_slab].val_map[value_slab].max_len); +} + +void BlueStore::DBHistogram::dump(Formatter *f) +{ + f->open_object_section("rocksdb_value_distribution"); + for (auto i : value_hist) { + f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second); + } + f->close_section(); + f->open_object_section("rocksdb_key_value_histogram"); + for (auto i : key_hist) { + f->dump_string("prefix", i.first); + f->open_object_section("key_hist"); + for ( auto k : i.second) { + f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count); + f->dump_unsigned("max_len", k.second.max_len); + f->open_object_section("value_hist"); + for ( auto j : k.second.val_map) { + f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count); + f->dump_unsigned("max_len", j.second.max_len); + } + f->close_section(); + } + f->close_section(); + } + f->close_section(); +} + +//Itrerates through the db and collects the stats +void BlueStore::generate_db_histogram(Formatter *f) +{ + //globals + uint64_t num_onodes = 0; + uint64_t num_shards = 0; + uint64_t num_super = 0; + uint64_t num_coll = 0; + uint64_t num_omap = 0; + uint64_t num_wal = 0; + uint64_t num_alloc = 0; + uint64_t num_stat = 0; + uint64_t num_others = 0; + uint64_t num_shared_shards = 0; + size_t max_key_size =0, max_value_size = 0; + uint64_t total_key_size = 0, total_value_size = 0; + size_t key_size = 0, value_size = 0; + DBHistogram hist; + + utime_t start = 
ceph_clock_now(); + + KeyValueDB::WholeSpaceIterator iter = db->get_iterator(); + iter->seek_to_first(); + while (iter->valid()) { + dout(30) << __func__ << " Key: " << iter->key() << dendl; + key_size = iter->key_size(); + value_size = iter->value_size(); + hist.value_hist[hist.get_value_slab(value_size)]++; + max_key_size = MAX(max_key_size, key_size); + max_value_size = MAX(max_value_size, value_size); + total_key_size += key_size; + total_value_size += value_size; + + pair key(iter->raw_key()); + + if (key.first == PREFIX_SUPER) { + hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size); + num_super++; + } else if (key.first == PREFIX_STAT) { + hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size); + num_stat++; + } else if (key.first == PREFIX_COLL) { + hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size); + num_coll++; + } else if (key.first == PREFIX_OBJ) { + if (key.second.back() == ONODE_KEY_SUFFIX) { + hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size); + num_onodes++; + } else { + hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size); + num_shards++; + } + } else if (key.first == PREFIX_OMAP) { + hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size); + num_omap++; + } else if (key.first == PREFIX_WAL) { + hist.update_hist_entry(hist.key_hist, PREFIX_WAL, key_size, value_size); + num_wal++; + } else if (key.first == PREFIX_ALLOC || key.first == "b" ) { + hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size); + num_alloc++; + } else if (key.first == PREFIX_SHARED_BLOB) { + hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size); + num_shared_shards++; + } else { + hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size); + num_others++; + } + iter->next(); + } + + utime_t duration = ceph_clock_now() - start; + f->open_object_section("rocksdb_key_value_stats"); + 
f->dump_unsigned("num_onodes", num_onodes); + f->dump_unsigned("num_shards", num_shards); + f->dump_unsigned("num_super", num_super); + f->dump_unsigned("num_coll", num_coll); + f->dump_unsigned("num_omap", num_omap); + f->dump_unsigned("num_wal", num_wal); + f->dump_unsigned("num_alloc", num_alloc); + f->dump_unsigned("num_stat", num_stat); + f->dump_unsigned("num_shared_shards", num_shared_shards); + f->dump_unsigned("num_others", num_others); + f->dump_unsigned("max_key_size", max_key_size); + f->dump_unsigned("max_value_size", max_value_size); + f->dump_unsigned("total_key_size", total_key_size); + f->dump_unsigned("total_value_size", total_value_size); + f->close_section(); + + hist.dump(f); + + dout(20) << __func__ << " finished in " << duration << " seconds" << dendl; + +} // =========================================== diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index d74f9256d01a..a9df542584d3 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1496,6 +1496,29 @@ public: } }; + struct DBHistogram { + struct value_dist { + uint64_t count; + uint32_t max_len; + }; + + struct key_dist { + uint64_t count; + uint32_t max_len; + map val_map; ///< slab id to count, max length of value and key + }; + + map > key_hist; + map value_hist; + int get_key_slab(size_t sz); + string get_key_slab_to_range(int slab); + int get_value_slab(size_t sz); + string get_value_slab_to_range(int slab); + void update_hist_entry(map > &key_hist, + const string &prefix, size_t key_size, size_t value_size); + void dump(Formatter *f); + }; + // -------------------------------------------------------- // members private: @@ -1770,7 +1793,8 @@ public: return 0; } - void get_db_statistics(Formatter *f); + void get_db_statistics(Formatter *f) override; + void generate_db_histogram(Formatter *f) override; public: int statfs(struct store_statfs_t *buf) override; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 0347b15117d9..017e6ebbbf76 
100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1939,6 +1939,8 @@ bool OSD::asok_command(string command, cmdmap_t& cmdmap, string format, store->get_db_statistics(f); } else if (command == "dump_scrubs") { service.dumps_scrub(f); + } else if (command == "calc_objectstore_db_histogram") { + store->generate_db_histogram(f); } else { assert(0 == "broken asok registration"); } @@ -2409,6 +2411,10 @@ void OSD::final_init() "print scheduled scrubs"); assert(r == 0); + r = admin_socket->register_command("calc_objectstore_db_histogram", "calc_objectstore_db_histogram", asok_hook, + "Generate key value histogram of kvdb(rocksdb) which used by bluestore"); + assert(r == 0); + test_ops_hook = new TestOpsSocketHook(&(this->service), this->store); // Note: pools are CephString instead of CephPoolname because // these commands traditionally support both pool names and numbers @@ -2734,6 +2740,7 @@ int OSD::shutdown() cct->get_admin_socket()->unregister_command("set_heap_property"); cct->get_admin_socket()->unregister_command("get_heap_property"); cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats"); + cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram"); delete asok_hook; asok_hook = NULL;