class MOSDPGCreate2 : public Message {
public:
- static constexpr int HEAD_VERSION = 1;
+ static constexpr int HEAD_VERSION = 2;
static constexpr int COMPAT_VERSION = 1;
epoch_t epoch = 0;
map<spg_t,pair<epoch_t,utime_t>> pgs;
+ map<spg_t,pair<pg_history_t,PastIntervals>> pg_extra;
MOSDPGCreate2()
: Message{MSG_OSD_PG_CREATE2, HEAD_VERSION, COMPAT_VERSION} {}
using ceph::encode;
encode(epoch, payload);
encode(pgs, payload);
+ encode(pg_extra, payload);
}
void decode_payload() override {
auto p = payload.cbegin();
using ceph::decode;
decode(epoch, p);
decode(pgs, p);
+ if (header.version >= 2) {
+ decode(pg_extra, p);
+ }
}
private:
template<class T, typename... Args>
struct creating_pgs_t {
epoch_t last_scan_epoch = 0;
+ struct pg_create_info {
+ epoch_t create_epoch;
+ utime_t create_stamp;
+
+ // NOTE: pre-octopus instances of this class will have a
+ // zeroed-out history
+ vector<int> up;
+ int up_primary = -1;
+ vector<int> acting;
+ int acting_primary = -1;
+ pg_history_t history;
+ PastIntervals past_intervals;
+
+ void encode(bufferlist& bl, uint64_t features) const {
+ using ceph::encode;
+ if (!HAVE_FEATURE(features, SERVER_OCTOPUS)) {
+ // was pair<epoch_t,utime_t> prior to octopus
+ encode(create_epoch, bl);
+ encode(create_stamp, bl);
+ return;
+ }
+ ENCODE_START(1, 1, bl);
+ encode(create_epoch, bl);
+ encode(create_stamp, bl);
+ encode(up, bl);
+ encode(up_primary, bl);
+ encode(acting, bl);
+ encode(acting_primary, bl);
+ encode(history, bl);
+ encode(past_intervals, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode_legacy(bufferlist::const_iterator& p) {
+ using ceph::decode;
+ decode(create_epoch, p);
+ decode(create_stamp, p);
+ }
+ void decode(bufferlist::const_iterator& p) {
+ using ceph::decode;
+ DECODE_START(1, p);
+ decode(create_epoch, p);
+ decode(create_stamp, p);
+ decode(up, p);
+ decode(up_primary, p);
+ decode(acting, p);
+ decode(acting_primary, p);
+ decode(history, p);
+ decode(past_intervals, p);
+ DECODE_FINISH(p);
+ }
+ void dump(Formatter *f) const {
+ f->dump_unsigned("create_epoch", create_epoch);
+ f->dump_stream("create_stamp") << create_stamp;
+ f->open_array_section("up");
+ for (auto& i : up) {
+ f->dump_unsigned("osd", i);
+ }
+ f->close_section();
+ f->dump_int("up_primary", up_primary);
+ f->open_array_section("acting");
+ for (auto& i : acting) {
+ f->dump_unsigned("osd", i);
+ }
+ f->close_section();
+ f->dump_int("acting_primary", up_primary);
+ f->dump_object("pg_history", history);
+ f->dump_object("past_intervals", past_intervals);
+ }
+
+ pg_create_info() {}
+ pg_create_info(epoch_t e, utime_t t)
+ : create_epoch(e),
+ create_stamp(t) {
+ // NOTE: we don't initialize the other fields here; see
+ // OSDMonitor::update_pending_pgs()
+ }
+ };
+
/// pgs we are currently creating
- std::map<pg_t, std::pair<epoch_t, utime_t> > pgs;
+ std::map<pg_t, pg_create_info> pgs;
- struct create_info {
+ struct pool_create_info {
epoch_t created;
utime_t modified;
uint64_t start = 0;
};
/// queue of pgs we still need to create (poolid -> <created, set of ps>)
- map<int64_t,create_info> queue;
+ map<int64_t,pool_create_info> queue;
/// pools that exist in the osdmap for which at least one pg has been created
std::set<int64_t> created_pools;
queue.erase(removed_pool);
return total - pgs.size();
}
- void encode(bufferlist& bl) const {
- ENCODE_START(2, 1, bl);
+ void encode(bufferlist& bl, uint64_t features) const {
+ unsigned v = 3;
+ if (!HAVE_FEATURE(features, SERVER_OCTOPUS)) {
+ v = 2;
+ }
+ ENCODE_START(v, 1, bl);
encode(last_scan_epoch, bl);
- encode(pgs, bl);
+ encode(pgs, bl, features);
encode(created_pools, bl);
encode(queue, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::const_iterator& bl) {
- DECODE_START(2, bl);
+ DECODE_START(3, bl);
decode(last_scan_epoch, bl);
- decode(pgs, bl);
+ if (struct_v >= 3) {
+ decode(pgs, bl);
+ } else {
+ // legacy pg encoding
+ pgs.clear();
+ uint32_t num;
+ decode(num, bl);
+ while (num--) {
+ pg_t pgid;
+ decode(pgid, bl);
+ pgs[pgid].decode_legacy(bl);
+ }
+ }
decode(created_pools, bl);
if (struct_v >= 2)
decode(queue, bl);
for (auto& pg : pgs) {
f->open_object_section("pg");
f->dump_stream("pgid") << pg.first;
- f->dump_unsigned("epoch", pg.second.first);
- f->dump_stream("ctime") << pg.second.second;
+ f->dump_object("pg_create_info", pg.second);
f->close_section();
}
f->close_section();
static void generate_test_instances(list<creating_pgs_t*>& o) {
auto c = new creating_pgs_t;
c->last_scan_epoch = 17;
- c->pgs.emplace(pg_t{42, 2}, make_pair(31, utime_t{891, 113}));
- c->pgs.emplace(pg_t{44, 2}, make_pair(31, utime_t{891, 113}));
+ c->pgs.emplace(pg_t{42, 2}, pg_create_info(31, utime_t{891, 113}));
+ c->pgs.emplace(pg_t{44, 2}, pg_create_info(31, utime_t{891, 113}));
c->created_pools = {0, 1};
o.push_back(c);
c = new creating_pgs_t;
c->last_scan_epoch = 18;
- c->pgs.emplace(pg_t{42, 3}, make_pair(31, utime_t{891, 113}));
+ c->pgs.emplace(pg_t{42, 3}, pg_create_info(31, utime_t{891, 113}));
c->created_pools = {};
o.push_back(c);
}
};
-WRITE_CLASS_ENCODER(creating_pgs_t::create_info);
-WRITE_CLASS_ENCODER(creating_pgs_t);
+WRITE_CLASS_ENCODER_FEATURES(creating_pgs_t::pg_create_info)
+WRITE_CLASS_ENCODER(creating_pgs_t::pool_create_info)
+WRITE_CLASS_ENCODER_FEATURES(creating_pgs_t)
const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
// NOTE: use the *current* epoch as the PG creation epoch so that the
// OSD does not have to generate a long set of PastIntervals.
- pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
- p->second.modified));
+ pending_creatings.pgs.emplace(
+ pgid,
+ creating_pgs_t::pg_create_info(inc.epoch,
+ p->second.modified));
dout(10) << __func__ << " adding " << pgid << dendl;
}
p->second.start = end;
}
dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
<< " pools" << dendl;
+
+ if (mon->monmap->min_mon_release >= CEPH_RELEASE_OCTOPUS) {
+ // walk creating pgs' history and past_intervals forward
+ for (auto& i : pending_creatings.pgs) {
+ // this mirrors PG::start_peering_interval()
+ pg_t pgid = i.first;
+
+ // this is a bit imprecise, but sufficient?
+ struct min_size_predicate_t : public IsPGRecoverablePredicate {
+ const pg_pool_t *pi;
+ bool operator()(const set<pg_shard_t> &have) const {
+ return have.size() >= pi->min_size;
+ }
+ explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
+ } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));
+
+ vector<int> up, acting;
+ int up_primary, acting_primary;
+ nextmap.pg_to_up_acting_osds(
+ pgid, &up, &up_primary, &acting, &acting_primary);
+ if (i.second.history.epoch_created == 0) {
+ // new pg entry, set it up
+ i.second.up = up;
+ i.second.acting = acting;
+ i.second.up_primary = up_primary;
+ i.second.acting_primary = acting_primary;
+ i.second.history = pg_history_t(i.second.create_epoch,
+ i.second.create_stamp);
+ dout(10) << __func__ << " pg " << pgid << " just added, "
+ << " up " << i.second.up
+ << " p " << i.second.up_primary
+ << " acting " << i.second.acting
+ << " p " << i.second.acting_primary
+ << " history " << i.second.history
+ << " past_intervals " << i.second.past_intervals
+ << dendl;
+ } else {
+ std::stringstream debug;
+ if (PastIntervals::check_new_interval(
+ i.second.acting_primary, acting_primary,
+ i.second.acting, acting,
+ i.second.up_primary, up_primary,
+ i.second.up, up,
+ i.second.history.same_interval_since,
+ i.second.history.last_epoch_clean,
+ &nextmap,
+ &osdmap,
+ pgid,
+ &min_size_predicate,
+ &i.second.past_intervals,
+ &debug)) {
+ epoch_t e = inc.epoch;
+ i.second.history.same_interval_since = e;
+ if (i.second.up != up) {
+ i.second.history.same_up_since = e;
+ }
+ if (i.second.acting_primary != acting_primary) {
+ i.second.history.same_primary_since = e;
+ }
+ if (pgid.is_split(
+ osdmap.get_pg_num(pgid.pool()),
+ nextmap.get_pg_num(pgid.pool()),
+ nullptr)) {
+ i.second.history.last_epoch_split = e;
+ }
+ dout(10) << __func__ << " pg " << pgid << " new interval,"
+ << " up " << i.second.up << " -> " << up
+ << " p " << i.second.up_primary << " -> " << up_primary
+ << " acting " << i.second.acting << " -> " << acting
+ << " p " << i.second.acting_primary << " -> "
+ << acting_primary
+ << " history " << i.second.history
+ << " past_intervals " << i.second.past_intervals
+ << dendl;
+ dout(20) << " debug: " << debug.str() << dendl;
+ i.second.up = up;
+ i.second.acting = acting;
+ i.second.up_primary = up_primary;
+ i.second.acting_primary = acting_primary;
+ }
+ }
+ }
+ }
dout(10) << __func__
<< " " << (pending_creatings.pgs.size() - total)
<< "/" << pending_creatings.pgs.size()
// process the pool flag removal below in the same osdmap epoch.
auto pending_creatings = update_pending_pgs(pending_inc, tmp);
bufferlist creatings_bl;
- encode(pending_creatings, creatings_bl);
+ uint64_t features = CEPH_FEATURES_ALL;
+ if (mon->monmap->min_mon_release < CEPH_RELEASE_OCTOPUS) {
+ dout(20) << __func__ << " encoding pending pgs without octopus features"
+ << dendl;
+ features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
+ }
+ encode(pending_creatings, creatings_bl, features);
t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
// remove any old (or incompat) POOL_CREATING flags
<< dendl;
continue;
}
- auto mapped = pg.second.first;
+ auto mapped = pg.second.create_epoch;
dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
spg_t spgid(pgid);
mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
oldm = new MOSDPGCreate(creating_pgs_epoch);
}
oldm->mkpg.emplace(pg.pgid,
- pg_create_t{create->second.first, pg.pgid, 0});
- oldm->ctimes.emplace(pg.pgid, create->second.second);
+ pg_create_t{create->second.create_epoch, pg.pgid, 0});
+ oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
} else {
if (!m) {
m = new MOSDPGCreate2(creating_pgs_epoch);
}
- m->pgs.emplace(pg, create->second);
+ m->pgs.emplace(pg, make_pair(create->second.create_epoch,
+ create->second.create_stamp));
+ if (create->second.history.epoch_created) {
+ dout(20) << __func__ << " " << pg << " " << create->second.history
+ << " " << create->second.past_intervals << dendl;
+ m->pg_extra.emplace(pg, make_pair(create->second.history,
+ create->second.past_intervals));
+ }
}
dout(20) << __func__ << " will create " << pg
- << " at " << create->second.first << dendl;
+ << " at " << create->second.create_epoch << dendl;
}
}
if (m) {
bool creating_now;
{
std::lock_guard<std::mutex> l(creating_pgs_lock);
- auto emplaced = creating_pgs.pgs.emplace(pgid,
- make_pair(osdmap.get_epoch(),
- ceph_clock_now()));
+ auto emplaced = creating_pgs.pgs.emplace(
+ pgid,
+ creating_pgs_t::pg_create_info(osdmap.get_epoch(),
+ ceph_clock_now()));
creating_now = emplaced.second;
}
if (creating_now) {
TYPE(ScrubResult)
#include "mon/CreatingPGs.h"
-TYPE(creating_pgs_t)
+TYPE_FEATUREFUL(creating_pgs_t)
#include "mgr/ServiceMap.h"
TYPE_FEATUREFUL(ServiceMap)
creating.last_scan_epoch = last_osdmap_epoch;
bufferlist newbl;
- ::encode(creating, newbl);
+ encode(creating, newbl, CEPH_FEATURES_ALL);
auto t = make_shared<MonitorDBStore::Transaction>();
t->put("osd_pg_creating", "creating", newbl);