This will simplify our identification of split and merge events.
Signed-off-by: Sage Weil <sage@redhat.com>
ceph_assert(osd_lock.is_locked());
dout(0) << "load_pgs" << dendl;
+ {
+ auto pghist = make_pg_num_history_oid();
+ bufferlist bl;
+ int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
+ if (r >= 0 && bl.length() > 0) {
+ auto p = bl.cbegin();
+ decode(pg_num_history, p);
+ }
+ dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
+ }
+
vector<coll_t> ls;
int r = store->list_collections(ls);
if (r < 0) {
if (superblock.oldest_map) {
// make sure we at least keep pace with incoming maps
trim_maps(m->oldest_map, last - first + 1, skip_maps);
+ pg_num_history.prune(superblock.oldest_map);
}
if (!superblock.oldest_map || skip_maps)
superblock.clean_thru = last;
}
- // check for deleted pools
+ // check for pg_num changes and deleted pools
OSDMapRef lastmap;
for (auto& i : added_maps) {
if (!lastmap) {
ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
for (auto& j : lastmap->get_pools()) {
if (!i.second->have_pg_pool(j.first)) {
+ pg_num_history.log_pool_delete(i.first, j.first);
dout(10) << __func__ << " recording final pg_pool_t for pool "
<< j.first << dendl;
// this information is needed by _make_pg() if have to restart before
encode(profile, bl);
t.write(coll_t::meta(), obj, 0, bl.length(), bl);
service.store_deleted_pool_pg_num(j.first, j.second.get_pg_num());
+ } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
+ new_pg_num != j.second.get_pg_num()) {
+ dout(10) << __func__ << " recording pool " << j.first << " pg_num "
+ << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
+ pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
+ }
+ }
+ for (auto& j : i.second->get_pools()) {
+ if (!lastmap->have_pg_pool(j.first)) {
+ dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
+ << j.second.get_pg_num() << dendl;
+ pg_num_history.log_pg_num_change(i.first, j.first,
+ j.second.get_pg_num());
}
}
lastmap = i.second;
}
+ pg_num_history.epoch = last;
+ {
+ bufferlist bl;
+ ::encode(pg_num_history, bl);
+ t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
+ dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
+ }
// superblock and commit
write_superblock(t);
CEPH_NOSNAP)));
}
+ static ghobject_t make_pg_num_history_oid() {  // id of the "pg_num_history" object (CEPH_NOSNAP) that load_pgs reads and the map-handling path writes in coll_t::meta()
+ return ghobject_t(hobject_t(sobject_t("pg_num_history", CEPH_NOSNAP)));
+ }
+
static void recursive_remove_collection(CephContext* cct,
ObjectStore *store,
spg_t pgid,
return osdmap ? osdmap->get_epoch() : 0;
}
+ pool_pg_num_history_t pg_num_history;
+
utime_t had_map_since;
RWLock map_lock;
list<OpRequestRef> waiting_for_osdmap;
};
ostream &operator<<(ostream &lhs, const store_statfs_t &rhs);
+
+struct pool_pg_num_history_t {  // per-pool log of pg_num values and pool deletions, keyed by epoch
+ /// last epoch updated (assigned directly by the owner; not touched by the log_*() or prune() helpers)
+ epoch_t epoch = 0;
+ /// poolid -> epoch -> pg_num recorded for that pool at that epoch
+ map<int64_t,map<epoch_t,uint32_t>> pg_nums;
+ /// pair(epoch, poolid); epoch is the first member, so the set iterates in ascending epoch order
+ set<pair<epoch_t,int64_t>> deleted_pools;
+
+ void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) {  // record pg_num for (pool, epoch)
+ pg_nums[pool][epoch] = pg_num;  // operator[] creates the pool entry if absent and overwrites an existing (pool, epoch) value
+ }
+ void log_pool_delete(epoch_t epoch, int64_t pool) {  // record that pool was deleted at epoch
+ deleted_pools.insert(make_pair(epoch, pool));
+ }
+
+ /// prune history based on oldest osdmap epoch in the cluster:
+ /// forgets pools deleted before oldest_epoch, then trims each remaining pool's old pg_num entries
+ void prune(epoch_t oldest_epoch) {
+ auto i = deleted_pools.begin();
+ while (i != deleted_pools.end()) {
+ if (i->first >= oldest_epoch) {  // set is epoch-ordered, so every later entry is also >= oldest_epoch
+ break;
+ }
+ pg_nums.erase(i->second);  // drop the deleted pool's whole pg_num history
+ i = deleted_pools.erase(i);  // erase() returns the next iterator, keeping the loop valid
+ }
+ for (auto& j : pg_nums) {
+ auto k = j.second.lower_bound(oldest_epoch);  // first entry with epoch >= oldest_epoch (end() if none)
+ // keep this and the single entry immediately before it (just to be paranoid);
+ // everything older than that earlier entry is erased
+ if (k != j.second.begin()) {  // if k is already the first entry there is nothing older to trim
+ --k;  // step back to the newest entry older than oldest_epoch
+ j.second.erase(j.second.begin(), k);  // half-open range: the entry at k itself survives
+ }
+ }
+ }
+
+ void encode(bufferlist& bl) const {  // field order here must stay in sync with decode()
+ ENCODE_START(1, 1, bl);  // NOTE(review): presumably (struct version, compat version) -- confirm against the encoding macros
+ encode(epoch, bl);
+ encode(pg_nums, bl);
+ encode(deleted_pools, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& p) {  // reads the fields in the same order encode() wrote them
+ DECODE_START(1, p);
+ decode(epoch, p);
+ decode(pg_nums, p);
+ decode(deleted_pools, p);
+ DECODE_FINISH(p);
+ }
+ void dump(Formatter *f) const {  // emits "epoch", a "pools" section, then a "deleted_pools" array
+ f->dump_unsigned("epoch", epoch);
+ f->open_object_section("pools");
+ for (auto& i : pg_nums) {  // one "pool" object per pool id
+ f->open_object_section("pool");
+ f->dump_unsigned("pool_id", i.first);
+ f->open_array_section("changes");
+ for (auto& j : i.second) {  // one "change" object per recorded (epoch, pg_num)
+ f->open_object_section("change");
+ f->dump_unsigned("epoch", j.first);
+ f->dump_unsigned("pg_num", j.second);
+ f->close_section();  // change
+ }
+ f->close_section();  // changes
+ f->close_section();  // pool
+ }
+ f->close_section();  // pools
+ f->open_array_section("deleted_pools");
+ for (auto& i : deleted_pools) {  // stored as pair(epoch, poolid), so .second is the pool id
+ f->open_object_section("deletion");
+ f->dump_unsigned("pool_id", i.second);
+ f->dump_unsigned("epoch", i.first);
+ f->close_section();  // deletion
+ }
+ f->close_section();  // deleted_pools
+ }
+ static void generate_test_instances(list<pool_pg_num_history_t*>& ls) {  // appends one default-constructed instance allocated with new
+ ls.push_back(new pool_pg_num_history_t);
+ }
+ friend ostream& operator<<(ostream& out, const pool_pg_num_history_t& h) {  // one-line summary: epoch, pg_nums, deleted_pools
+ return out << "pg_num_history(e" << h.epoch
+ << " pg_nums " << h.pg_nums
+ << " deleted_pools " << h.deleted_pools
+ << ")";
+ }
+};
+WRITE_CLASS_ENCODER(pool_pg_num_history_t)
+
#endif
TYPE_FEATUREFUL(obj_list_watch_response_t)
TYPE(clone_info)
TYPE(obj_list_snap_response_t)
+TYPE(pool_pg_num_history_t)
#include "osd/ECUtil.h"
// TYPE(stripe_info_t) non-standard encoding/decoding functions