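The monitor uses pg_stat_t information to decide which OSD to target with pg creates, so record up_primary and acting_primary explicitly in pg_stat_t instead of assuming the first entry of the acting set is the primary.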
Fixes: #7481
Signed-off-by: Samuel Just <sam.just@inktank.com>
Reviewed-by: Greg Farnum <greg@inktank.com>
void PGMap::dump_pg_stats_plain(ostream& ss,
const ceph::unordered_map<pg_t, pg_stat_t>& pg_stats) const
{
- ss << "pg_stat\tobjects\tmip\tdegr\tunf\tbytes\tlog\tdisklog\tstate\tstate_stamp\tv\treported\tup\tacting\tlast_scrub\tscrub_stamp\tlast_deep_scrub\tdeep_scrub_stamp" << std::endl;
+ ss << "pg_stat\tobjects\tmip\tdegr\tunf\tbytes\tlog\tdisklog\tstate\tstate_stamp\tv\treported\tup\tup_primary\tacting\tacting_primary\tlast_scrub\tscrub_stamp\tlast_deep_scrub\tdeep_scrub_stamp" << std::endl;
for (ceph::unordered_map<pg_t, pg_stat_t>::const_iterator i = pg_stats.begin();
i != pg_stats.end(); ++i) {
const pg_stat_t &st(i->second);
<< "\t" << st.version
<< "\t" << st.reported_epoch << ":" << st.reported_seq
<< "\t" << st.up
+ << "\t" << st.up_primary
<< "\t" << st.acting
+ << "\t" << st.acting_primary
<< "\t" << st.last_scrub << "\t" << st.last_scrub_stamp
<< "\t" << st.last_deep_scrub << "\t" << st.last_deep_scrub_stamp
<< std::endl;
pg_stat_t& s = pg_map.pg_stat[pgid];
if (s.parent_split_bits)
on = s.parent;
- vector<int> acting;
- int nrep = osdmap->pg_to_acting_osds(on, acting);
- if (s.acting.size()) {
- pg_map.creating_pgs_by_osd[s.acting[0]].erase(pgid);
- if (pg_map.creating_pgs_by_osd[s.acting[0]].size() == 0)
- pg_map.creating_pgs_by_osd.erase(s.acting[0]);
+ vector<int> up, acting;
+ int up_primary, acting_primary;
+ osdmap->pg_to_up_acting_osds(
+ on,
+ &up,
+ &up_primary,
+ &acting,
+ &acting_primary);
+
+ if (s.acting_primary != -1) {
+ pg_map.creating_pgs_by_osd[s.acting_primary].erase(pgid);
+ if (pg_map.creating_pgs_by_osd[s.acting_primary].size() == 0)
+ pg_map.creating_pgs_by_osd.erase(s.acting_primary);
}
+ s.up = up;
+ s.up_primary = up_primary;
s.acting = acting;
+ s.acting_primary = acting_primary;
// don't send creates for localized pgs
if (pgid.preferred() >= 0)
if (s.parent_split_bits)
continue;
- if (nrep) {
- pg_map.creating_pgs_by_osd[acting[0]].insert(pgid);
+ if (acting_primary != -1) {
+ pg_map.creating_pgs_by_osd[acting_primary].insert(pgid);
} else {
dout(20) << "map_pg_creates " << pgid << " -> no osds in epoch "
<< mon->osdmon()->osdmap.get_epoch() << ", skipping" << dendl;
p != pg_map.pg_stat.end();
++p) {
if ((p->second.state & PG_STATE_STALE) == 0 &&
- p->second.acting.size() &&
- osdmap->is_down(p->second.acting[0])) {
+ p->second.acting_primary != -1 &&
+ osdmap->is_down(p->second.acting_primary)) {
dout(10) << " marking pg " << p->first << " stale with acting " << p->second.acting << dendl;
map<pg_t,pg_stat_t>::iterator q = pending_inc.pg_stat_updates.find(p->first);
r = -ENOENT;
goto reply;
}
- if (!pg_map.pg_stat[pgid].acting.size()) {
+ // acting_primary is -1 when the pg has no primary; only then bail out
+ // with EAGAIN. (Testing != -1 here would reject every pg that does have
+ // a primary and let a primary-less pg fall through with osd == -1.)
+ if (pg_map.pg_stat[pgid].acting_primary == -1) {
ss << "pg " << pgid << " has no primary osd";
r = -EAGAIN;
goto reply;
}
- int osd = pg_map.pg_stat[pgid].acting[0];
+ int osd = pg_map.pg_stat[pgid].acting_primary;
if (!mon->osdmon()->osdmap.is_up(osd)) {
ss << "pg " << pgid << " primary osd." << osd << " not up";
r = -EAGAIN;
past_intervals.swap(pi);
info.stats.up = up;
+ info.stats.up_primary = new_up_primary;
info.stats.acting = acting;
+ info.stats.acting_primary = new_acting_primary;
info.stats.mapping_epoch = info.history.same_interval_since;
if (backfill) {
new_acting_primary);
if (info.stats.up != up ||
- info.stats.acting != acting) {
+ info.stats.acting != acting ||
+ info.stats.up_primary != new_up_primary ||
+ info.stats.acting_primary != new_acting_primary) {
info.stats.up = up;
+ info.stats.up_primary = new_up_primary;
info.stats.acting = acting;
+ info.stats.acting_primary = new_acting_primary;
info.stats.mapping_epoch = info.history.same_interval_since;
}
f->open_array_section("acting");
for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
f->dump_int("osd", *p);
f->close_section();
+ // Emit the primaries only after the "acting" array has been closed, so
+ // they are dumped as their own fields and not as extra array elements.
+ f->dump_int("up_primary", up_primary);
+ f->dump_int("acting_primary", acting_primary);
}
f->open_array_section("acting");
for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
f->dump_int("osd", *p);
f->close_section();
+ // Emit the primaries only after the "acting" array has been closed, so
+ // they are dumped as their own fields and not as extra array elements.
+ f->dump_int("up_primary", up_primary);
+ f->dump_int("acting_primary", acting_primary);
}
void pg_stat_t::encode(bufferlist &bl) const
{
- ENCODE_START(14, 8, bl);
+ ENCODE_START(15, 8, bl);  // struct version 15: adds up_primary and acting_primary as the last fields
::encode(version, bl);
::encode(reported_seq, bl);
::encode(reported_epoch, bl);
::encode(last_clean_scrub_stamp, bl);
::encode(last_became_active, bl);
::encode(dirty_stats_invalid, bl);
+ ::encode(up_primary, bl);  // written in the same order decode() reads them for struct_v >= 15
+ ::encode(acting_primary, bl);
ENCODE_FINISH(bl);
}
void pg_stat_t::decode(bufferlist::iterator &bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(14, 8, 8, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(15, 8, 8, bl);  // 15 matches the version written by encode()
::decode(version, bl);
::decode(reported_seq, bl);
::decode(reported_epoch, bl);
// encoder may not have supported num_objects_dirty accounting.
dirty_stats_invalid = true;
}
+ if (struct_v >= 15) {  // v15+ encodings carry both primaries explicitly
+ ::decode(up_primary, bl);
+ ::decode(acting_primary, bl);
+ } else {  // older encodings: use the first entry of up/acting, or -1 when that set is empty
+ up_primary = up.size() ? up[0] : -1;
+ acting_primary = acting.size() ? acting[0] : -1;
+ }
DECODE_FINISH(bl);
}
a.log_size = 99;
a.ondisk_log_size = 88;
a.up.push_back(123);
+ a.up_primary = 123;
a.acting.push_back(456);
+ a.acting_primary = 456;
+ o.push_back(new pg_stat_t(a));
+
+ a.up.push_back(124);
+ a.up_primary = 124;
+ a.acting.push_back(124);
+ a.acting_primary = 124;
o.push_back(new pg_stat_t(a));
}
/// maintained starting from pool creation)
bool dirty_stats_invalid;
+ /// up, acting primaries
+ int up_primary;
+ int acting_primary;
+
pg_stat_t()
: reported_seq(0),
reported_epoch(0),
stats_invalid(false),
log_size(0), ondisk_log_size(0),
mapping_epoch(0),
- dirty_stats_invalid(false)
+ dirty_stats_invalid(false),
+ up_primary(-1),  // -1 means "no primary"; the same value decode() uses for an empty up set
+ acting_primary(-1)  // -1 means "no primary"; code reading this field compares against -1
{ }
epoch_t get_effective_last_epoch_clean() const {