dout(10) << *pg
<< " up " << oldup << " -> " << pg->up
- << " acting " << oldacting << " -> " << pg->acting
+ << ", acting " << oldacting << " -> " << pg->acting
<< ", role " << oldrole << " -> " << role << dendl;
// pg->on_*
clear_map_cache(); // we're done with it
update_heartbeat_peers();
+
+ send_pg_temp();
}
creating_pgs[pgid].parent = parent;
creating_pgs[pgid].split_bits = split_bits;
creating_pgs[pgid].acting.swap(acting);
- calc_priors_during(pgid, created, history.same_primary_since,
+ calc_priors_during(pgid, created, history.same_acting_since,
creating_pgs[pgid].prior);
// poll priors
goto out;
}
- assert(!pg->is_active());
-
if (!pg->build_backlog_map(omap))
goto out;
dout(10) << *pg << " generate_backlog aborting" << dendl;
goto out2;
}
- assert(!pg->is_active());
if (!pg->is_primary()) {
dout(10) << *pg << " sending info+missing+backlog to primary" << dendl;
+ assert(!pg->is_active()); // for now
MOSDPGLog *m = new MOSDPGLog(osdmap->get_epoch(), pg->info);
m->missing = pg->missing;
m->log = pg->log;
Log::Entry e;
e.soid = poid;
bufferlist bv;
- osd->store->getattr(info.pgid.to_coll(), poid, OI_ATTR, bv);
+ int r = osd->store->getattr(info.pgid.to_coll(), poid, OI_ATTR, bv);
+ if (r < 0)
+ continue; // musta just been deleted!
object_info_t oi(bv);
e.version = oi.version;
e.prior_version = oi.prior_version;
assert(!log.backlog);
log.backlog = true;
+ info.log_backlog = true;
/*
* note that we don't create prior_version backlog entries for
assert(log.backlog);
log.backlog = false;
+ info.log_backlog = false;
while (!log.log.empty()) {
Log::Entry &e = *log.log.begin();
clear_prior();
- // current nodes, of course.
- for (unsigned i=1; i<up.size(); i++)
- prior_set.insert(up[i]);
+ // current up and/or acting nodes, of course.
+ for (unsigned i=0; i<up.size(); i++)
+ if (up[i] != osd->whoami)
+ prior_set.insert(up[i]);
+ for (unsigned i=0; i<acting.size(); i++)
+ if (acting[i] != osd->whoami)
+ prior_set.insert(acting[i]);
// and prior PG mappings. move backwards in time.
state_clear(PG_STATE_CRASHED);
osd->recovery_wq.dequeue(this);
}
+bool PG::choose_acting(int newest_update_osd)
+{
+ vector<int> want = up;
+
+ Info& newest = (newest_update_osd == osd->whoami) ? info : peer_info[newest_update_osd];
+ Info& oprimi = (want[0] == osd->whoami) ? info : peer_info[want[0]];
+ if (newest_update_osd != want[0] &&
+ oprimi.last_update < newest.log_tail && !newest.log_backlog) {
+ // up[0] needs a backlog to catch up
+ // make newest_update_osd primary instead?
+ for (unsigned i=1; i<want.size(); i++)
+ if (want[i] == newest_update_osd) {
+ dout(10) << "choose_acting up[0] osd" << want[0] << " needs backlog to catch up, making "
+ << want[i] << " primary" << dendl;
+ want[0] = want[i];
+ want[i] = up[0];
+ break;
+ }
+ }
+ // exclude peers who need backlogs to catch up?
+ Info& primi = (want[0] == osd->whoami) ? info : peer_info[want[0]];
+ for (vector<int>::iterator p = want.begin() + 1; p != want.end(); ) {
+ Info& pi = (*p == osd->whoami) ? info : peer_info[*p];
+ if (pi.last_update < primi.log_tail && !primi.log_backlog) {
+ dout(10) << "choose_acting osd" << *p << " needs primary backlog to catch up" << dendl;
+ want.erase(p);
+ } else {
+ dout(10) << "choose_acting osd" << *p << " can catch up with osd" << want[0] << " log" << dendl;
+ p++;
+ }
+ }
+ if (want != acting) {
+ dout(10) << "choose_acting want " << want << " != acting " << acting
+ << ", requesting pg_temp change" << dendl;
+ if (want == up) {
+ vector<int> empty;
+ osd->queue_want_pg_temp(info.pgid, empty);
+ } else
+ osd->queue_want_pg_temp(info.pgid, want);
+ return false;
+ }
+ dout(10) << "choose_acting want " << want << " (== acting)" << dendl;
+ return true;
+}
// if false, stop.
bool PG::recover_master_log(map< int, map<pg_t,Query> >& query_map)
{
+ dout(10) << "recover_master_log" << dendl;
+
// -- query info from everyone in prior_set.
bool missing_info = false;
for (set<int>::iterator it = prior_set.begin();
newest_update = it->second.last_update;
newest_update_osd = it->first;
}
- if (is_up(it->first)) {
+ if (is_acting(it->first)) {
if (it->second.last_update < oldest_update) {
oldest_update = it->second.last_update;
oldest_who = it->first;
}
+ }
+ if (is_up(it->first)) {
if (it->second.last_complete < min_last_complete_ondisk)
min_last_complete_ondisk = it->second.last_complete;
}
newest_update_osd = osd->whoami;
// -- decide what acting set i want, based on state of up set
- vector<int> want = up;
- if (newest_update_osd != osd->whoami &&
- log.head < peer_info[newest_update_osd].log_tail) {
- // up[0] needs a backlog to catch up
- // make newest_update_osd primary instead?
- for (unsigned i=1; i<want.size(); i++)
- if (want[i] == newest_update_osd) {
- dout(10) << " osd" << want[0] << " needs backlog to catch up, making " << want[i] << " primary" << dendl;
- want[0] = want[i];
- want[i] = up[0];
- break;
- }
- }
- // exclude peers who need backlogs to catch up?
- Info& primi = (want[0] == osd->whoami) ? info : peer_info[want[0]];
- for (vector<int>::iterator p = want.begin() + 1; p != want.end(); ) {
- Info& pi = (*p == osd->whoami) ? info : peer_info[*p];
- if (pi.last_update < primi.log_tail) {
- dout(10) << " osd" << *p << " needs primary's backlog to catch up" << dendl;
- want.erase(p);
- } else {
- dout(10) << " osd" << *p << " can catch up with osd" << want[0] << " log" << dendl;
- p++;
- }
- }
- if (want != acting) {
- dout(10) << " want up " << want << " != acting " << acting << ", requesting pg_temp change" << dendl;
- if (want == up) {
- vector<int> empty;
- osd->queue_want_pg_temp(info.pgid, empty);
- } else
- osd->queue_want_pg_temp(info.pgid, want);
+ if (!choose_acting(newest_update_osd))
return false;
- }
- dout(10) << " want " << want << " (== acting)" << dendl;
// gather log(+missing) from that person!
if (newest_update_osd != osd->whoami) {
map< int, map<pg_t,Query> >& query_map,
map<int, MOSDPGInfo*> *activator_map)
{
- dout(10) << "peer acting is " << acting << dendl;
+ dout(10) << "peer up " << up << ", acting " << acting << dendl;
if (!is_active())
state_set(PG_STATE_PEERING);
dout(10) << "peer prior_set is " << prior_set << dendl;
- if (!have_master_log)
+ if (!have_master_log) {
if (!recover_master_log(query_map))
return;
+ } else if (up != acting) {
+ // are we done generating backlog(s)?
+ if (!choose_acting(osd->whoami))
+ return;
+ }
// -- do i need to generate backlog for any of my peers?
clear_prior();
// if we are building a backlog, cancel it!
- osd->cancel_generate_backlog(this);
+ if (up == acting)
+ osd->cancel_generate_backlog(this);
// write pg info, log
write_info(t);
::encode(last_epoch_started, bl);
::encode(same_acting_since, bl);
::encode(same_up_since, bl);
- //::encode(same_primary_since, bl);
+ ::encode(same_primary_since, bl);
}
- void decode(bufferlist::iterator &bl) {
+ void decode(bufferlist::iterator &bl, version_t v) {
::decode(epoch_created, bl);
::decode(last_epoch_started, bl);
::decode(same_acting_since, bl);
- ::decode(same_up_since, bl);
- //::decode(same_primary_since, bl);
+ if (v >= 20)
+ ::decode(same_up_since, bl);
+ else
+ same_up_since = same_acting_since;
+ ::decode(same_primary_since, bl);
}
} history;
::decode(log_tail, bl);
::decode(log_backlog, bl);
::decode(stats, bl);
- history.decode(bl);
+ history.decode(bl, v);
::decode(snap_trimq, bl);
}
};
- WRITE_CLASS_ENCODER(Info::History)
+ //WRITE_CLASS_ENCODER(Info::History)
WRITE_CLASS_ENCODER(Info)
void decode(bufferlist::iterator &bl) {
::decode(type, bl);
::decode(since, bl);
- history.decode(bl);
+ history.decode(bl, ~0ull);
}
};
WRITE_CLASS_ENCODER(Query)
public:
vector<int> up, acting;
map<int,eversion_t> peer_last_complete_ondisk;
- eversion_t min_last_complete_ondisk; // min over last_complete_ondisk, peer_last_complete_ondisk
+ eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk
eversion_t pg_trim_to;
// [primary only] content recovery state
bool need_up_thru;
set<int> stray_set; // non-acting osds that have PG data.
set<int> uptodate_set; // current OSDs that are uptodate
- eversion_t oldest_update; // lowest (valid) last_update in active set
+ eversion_t oldest_update; // acting: lowest (valid) last_update in active set
map<int,Info> peer_info; // info from peers (stray or prior)
set<int> peer_info_requested;
map<int, Missing> peer_missing;
bool is_prior(int osd) const { return prior_set.count(osd); }
bool is_stray(int osd) const { return stray_set.count(osd); }
- bool is_all_uptodate() const { return uptodate_set.size() == acting.size(); }
+ bool is_all_uptodate() const { return uptodate_set.size() == acting.size() && up == acting; }
void generate_past_intervals();
void trim_past_intervals();
void trim_write_ahead();
+ bool choose_acting(int newest_update_osd);
bool recover_master_log(map< int, map<pg_t,Query> >& query_map);
void peer(ObjectStore::Transaction& t,
map< int, map<pg_t,Query> >& query_map,
virtual void on_shutdown() = 0;
};
-WRITE_CLASS_ENCODER(PG::Info::History)
+//WRITE_CLASS_ENCODER(PG::Info::History)
WRITE_CLASS_ENCODER(PG::Info)
WRITE_CLASS_ENCODER(PG::Query)
WRITE_CLASS_ENCODER(PG::Missing::item)
inline ostream& operator<<(ostream& out, const PG& pg)
{
- out << "pg[" << pg.info
- << " " << pg.acting;
- out << " r=" << pg.get_role();
+ out << "pg[" << pg.info
+ << " " << pg.up;
if (pg.acting != pg.up)
- out << " up=" << pg.up;
+ out << "/" << pg.acting;
+ out << " r=" << pg.get_role();
if (pg.recovery_ops_active)
out << " rops=" << pg.recovery_ops_active;