From 69873feb4a28489118b4a97f911f743f6360b60f Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Thu, 31 May 2018 14:37:48 -0500
Subject: [PATCH] osd: collect and record pg_num changes by pool

This will simplify our identification of split and merge events.

Signed-off-by: Sage Weil
---
Notes (this area is ignored when the patch is applied):

 * OSD::load_pgs() reads the "pg_num_history" object from the meta
   collection and decodes it when the read succeeds and returns data;
   otherwise pg_num_history keeps its default-constructed state.
 * OSD::handle_osd_map() prunes the history against
   superblock.oldest_map, then for each consecutive pair of newly added
   maps logs pool deletions, pg_num changes and newly created pools.
   It finally stamps the history with the last epoch and writes it in
   the same transaction that is passed to write_superblock().
 * pool_pg_num_history_t::prune() drops deleted pools whose deletion
   epoch is older than oldest_epoch; for the remaining pools it keeps
   the first entry at or after oldest_epoch plus the one before it.
 * NOTE(review): this copy was rebuilt from a whitespace-collapsed
   extraction.  Line breaks were re-derived from the hunk headers and
   indentation uses spaces, so context whitespace may differ from the
   tree; apply with whitespace tolerance (e.g. git apply
   --ignore-whitespace).
 * NOTE(review): template arguments stripped by the extraction were
   restored.  Those on added lines follow from the code in this patch;
   those on the two context lines (vector<coll_t>, list<OpRequestRef>)
   should be confirmed against the target tree.  Author e-mail
   addresses were stripped as well and have not been restored.

 src/osd/OSD.cc                  | 35 ++++++++++++-
 src/osd/OSD.h                   |  6 +++
 src/osd/osd_types.h             | 88 +++++++++++++++++++++++++++++++++
 src/tools/ceph-dencoder/types.h |  1 +
 4 files changed, 129 insertions(+), 1 deletion(-)

diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index cf9c457c60b02..360c1617c2914 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -3884,6 +3884,17 @@ void OSD::load_pgs()
   ceph_assert(osd_lock.is_locked());
   dout(0) << "load_pgs" << dendl;
 
+  {
+    auto pghist = make_pg_num_history_oid();
+    bufferlist bl;
+    int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
+    if (r >= 0 && bl.length() > 0) {
+      auto p = bl.cbegin();
+      decode(pg_num_history, p);
+    }
+    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
+  }
+
   vector<coll_t> ls;
   int r = store->list_collections(ls);
   if (r < 0) {
@@ -7515,6 +7526,7 @@ void OSD::handle_osd_map(MOSDMap *m)
   if (superblock.oldest_map) {
     // make sure we at least keep pace with incoming maps
     trim_maps(m->oldest_map, last - first + 1, skip_maps);
+    pg_num_history.prune(superblock.oldest_map);
   }
 
   if (!superblock.oldest_map || skip_maps)
@@ -7529,7 +7541,7 @@ void OSD::handle_osd_map(MOSDMap *m)
     superblock.clean_thru = last;
   }
 
-  // check for deleted pools
+  // check for pg_num changes and deleted pools
   OSDMapRef lastmap;
   for (auto& i : added_maps) {
     if (!lastmap) {
@@ -7542,6 +7554,7 @@ void OSD::handle_osd_map(MOSDMap *m)
     ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
     for (auto& j : lastmap->get_pools()) {
       if (!i.second->have_pg_pool(j.first)) {
+        pg_num_history.log_pool_delete(i.first, j.first);
         dout(10) << __func__ << " recording final pg_pool_t for pool "
                  << j.first << dendl;
         // this information is needed by _make_pg() if have to restart before
@@ -7559,10 +7572,30 @@ void OSD::handle_osd_map(MOSDMap *m)
         encode(profile, bl);
         t.write(coll_t::meta(), obj, 0, bl.length(), bl);
         service.store_deleted_pool_pg_num(j.first, j.second.get_pg_num());
+      } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
+                 new_pg_num != j.second.get_pg_num()) {
+        dout(10) << __func__ << " recording pool " << j.first << " pg_num "
+                 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
+        pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
+      }
+    }
+    for (auto& j : i.second->get_pools()) {
+      if (!lastmap->have_pg_pool(j.first)) {
+        dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
+                 << j.second.get_pg_num() << dendl;
+        pg_num_history.log_pg_num_change(i.first, j.first,
+                                         j.second.get_pg_num());
       }
     }
     lastmap = i.second;
   }
+  pg_num_history.epoch = last;
+  {
+    bufferlist bl;
+    ::encode(pg_num_history, bl);
+    t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
+    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
+  }
 
   // superblock and commit
   write_superblock(t);
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 4b766f56ad241..596b77d08dc41 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1342,6 +1342,10 @@ public:
                                                           CEPH_NOSNAP)));
   }
 
+  static ghobject_t make_pg_num_history_oid() {
+    return ghobject_t(hobject_t(sobject_t("pg_num_history", CEPH_NOSNAP)));
+  }
+
   static void recursive_remove_collection(CephContext* cct,
                                           ObjectStore *store,
                                           spg_t pgid,
@@ -1803,6 +1807,8 @@ protected:
     return osdmap ? osdmap->get_epoch() : 0;
   }
 
+  pool_pg_num_history_t pg_num_history;
+
   utime_t had_map_since;
   RWLock map_lock;
   list<OpRequestRef> waiting_for_osdmap;
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 637f5cca1c1e9..d1b83591aca6f 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -5454,4 +5454,92 @@ struct store_statfs_t
 };
 ostream &operator<<(ostream &lhs, const store_statfs_t &rhs);
 
+
+struct pool_pg_num_history_t {
+  /// last epoch updated
+  epoch_t epoch = 0;
+  /// poolid -> epoch -> pg_num
+  map<int64_t,map<epoch_t,uint32_t>> pg_nums;
+  /// pair(epoch, poolid)
+  set<pair<epoch_t,int64_t>> deleted_pools;
+
+  void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) {
+    pg_nums[pool][epoch] = pg_num;
+  }
+  void log_pool_delete(epoch_t epoch, int64_t pool) {
+    deleted_pools.insert(make_pair(epoch, pool));
+  }
+
+  /// prune history based on oldest osdmap epoch in the cluster
+  void prune(epoch_t oldest_epoch) {
+    auto i = deleted_pools.begin();
+    while (i != deleted_pools.end()) {
+      if (i->first >= oldest_epoch) {
+        break;
+      }
+      pg_nums.erase(i->second);
+      i = deleted_pools.erase(i);
+    }
+    for (auto& j : pg_nums) {
+      auto k = j.second.lower_bound(oldest_epoch);
+      // keep this and the entry before it (just to be paranoid)
+      if (k != j.second.begin()) {
+        --k;
+        j.second.erase(j.second.begin(), k);
+      }
+    }
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(epoch, bl);
+    encode(pg_nums, bl);
+    encode(deleted_pools, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& p) {
+    DECODE_START(1, p);
+    decode(epoch, p);
+    decode(pg_nums, p);
+    decode(deleted_pools, p);
+    DECODE_FINISH(p);
+  }
+  void dump(Formatter *f) const {
+    f->dump_unsigned("epoch", epoch);
+    f->open_object_section("pools");
+    for (auto& i : pg_nums) {
+      f->open_object_section("pool");
+      f->dump_unsigned("pool_id", i.first);
+      f->open_array_section("changes");
+      for (auto& j : i.second) {
+        f->open_object_section("change");
+        f->dump_unsigned("epoch", j.first);
+        f->dump_unsigned("pg_num", j.second);
+        f->close_section();
+      }
+      f->close_section();
+      f->close_section();
+    }
+    f->close_section();
+    f->open_array_section("deleted_pools");
+    for (auto& i : deleted_pools) {
+      f->open_object_section("deletion");
+      f->dump_unsigned("pool_id", i.second);
+      f->dump_unsigned("epoch", i.first);
+      f->close_section();
+    }
+    f->close_section();
+  }
+  static void generate_test_instances(list<pool_pg_num_history_t*>& ls) {
+    ls.push_back(new pool_pg_num_history_t);
+  }
+  friend ostream& operator<<(ostream& out, const pool_pg_num_history_t& h) {
+    return out << "pg_num_history(e" << h.epoch
+               << " pg_nums " << h.pg_nums
+               << " deleted_pools " << h.deleted_pools
+               << ")";
+  }
+};
+WRITE_CLASS_ENCODER(pool_pg_num_history_t)
+
 #endif
diff --git a/src/tools/ceph-dencoder/types.h b/src/tools/ceph-dencoder/types.h
index 255c23f00d6d9..b3618c458287a 100644
--- a/src/tools/ceph-dencoder/types.h
+++ b/src/tools/ceph-dencoder/types.h
@@ -109,6 +109,7 @@ TYPE(ScrubMap)
 TYPE_FEATUREFUL(obj_list_watch_response_t)
 TYPE(clone_info)
 TYPE(obj_list_snap_response_t)
+TYPE(pool_pg_num_history_t)
 
 #include "osd/ECUtil.h"
 // TYPE(stripe_info_t) non-standard encoding/decoding functions
-- 
2.39.5