need_up_thru(false),
last_peering_reset(0),
heartbeat_peer_lock("PG::heartbeat_peer_lock"),
- backfill_target(-1),
backfill_reserved(0),
backfill_reserving(0),
flushes_in_progress(0),
ret = true;
}
- vector<int>::const_iterator end = acting.end();
- vector<int>::const_iterator a = acting.begin();
+ assert(actingbackfill.size() > 0);
+ vector<int>::const_iterator end = actingbackfill.end();
+ vector<int>::const_iterator a = actingbackfill.begin();
assert(a != end);
++a;
for (; a != end; ++a) {
bool ret = false;
- vector<int>::const_iterator end = acting.end();
- vector<int>::const_iterator a = acting.begin();
- assert(a != end);
- ++a;
+ // We can assume that the only osds that may need backfill
+ // are those in the backfill_targets vector.
+ vector<int>::const_iterator end = backfill_targets.end();
+ vector<int>::const_iterator a = backfill_targets.begin();
for (; a != end; ++a) {
int peer = *a;
map<int,pg_info_t>::const_iterator pi = peer_info.find(peer);
* incomplete, or another osd has a longer tail that allows us to
* bring other up nodes up to date.
*/
-bool PG::calc_acting(int& newest_update_osd_id, vector<int>& want) const
+bool PG::calc_acting(int& newest_update_osd_id, vector<int>& want, vector<int>& backfill) const
{
map<int, pg_info_t> all_info(peer_info.begin(), peer_info.end());
all_info[osd->whoami] = info;
<< " with " << primary->second << dendl;
want.push_back(primary->first);
unsigned usable = 1;
- unsigned backfill = 0;
// select replicas that have log contiguity with primary.
// prefer up, then acting, then any peer_info osds
continue;
const pg_info_t &cur_info = all_info.find(*i)->second;
if (cur_info.is_incomplete() || cur_info.last_update < primary->second.log_tail) {
- if (backfill < 1) {
- dout(10) << " osd." << *i << " (up) accepted (backfill) " << cur_info << dendl;
- want.push_back(*i);
- backfill++;
- } else {
- dout(10) << " osd." << *i << " (up) rejected" << cur_info << dendl;
- }
+ dout(10) << " osd." << *i << " (up) backfill " << cur_info << dendl;
+ backfill.push_back(*i);
} else {
want.push_back(*i);
usable++;
}
}
+ // This no longer has backfill OSDs, but they are covered above.
for (vector<int>::const_iterator i = acting.begin();
i != acting.end();
++i) {
*/
bool PG::choose_acting(int& newest_update_osd)
{
- vector<int> want;
+ vector<int> want, backfill;
- if (!calc_acting(newest_update_osd, want)) {
+ if (!calc_acting(newest_update_osd, want, backfill)) {
dout(10) << "choose_acting failed" << dendl;
assert(want_acting.empty());
return false;
}
- if (want.size() < pool.info.min_size) {
+ // For now we only backfill 1 at a time as before
+ if (!backfill.empty())
+ backfill.resize(1);
+
+ // This might cause a problem if min_size is large
+ // and we need to backfill more than 1 osd. Older
+ // code would only include 1 backfill osd and now we
+ // have the resize above.
+ if (want.size() + backfill.size() < pool.info.min_size) {
want_acting.clear();
return false;
}
dout(10) << "choose_acting want " << want << " != acting " << acting
<< ", requesting pg_temp change" << dendl;
want_acting = want;
+
if (want == up) {
+ // There can't be any pending backfill if
+ // want is the same as crush map up OSDs.
+ assert(backfill.empty());
vector<int> empty;
osd->queue_want_pg_temp(info.pgid, empty);
} else
osd->queue_want_pg_temp(info.pgid, want);
return false;
+ }
+ want_acting.clear();
+ // We can only get here when a new interval has arrived and
+ // we've accepted the acting set. Now we can create the
+ // actingbackfill and backfill_targets vectors.
+ actingbackfill = acting;
+ actingbackfill.insert(actingbackfill.end(), backfill.begin(), backfill.end());
+ assert(backfill_targets.empty() || backfill_targets == backfill);
+ if (backfill_targets.empty()) {
+ backfill_targets = backfill;
+ for (unsigned i = 0; i < backfill.size() ; ++i) {
+ stray_set.erase(backfill[i]);
+ }
} else {
- want_acting.clear();
+ // Will not change if already set because up would have had to change
+ assert(backfill_targets == backfill);
+ // Verify that nothing in backfill is in stray_set
+ for (unsigned i = 0; i < backfill.size() ; ++i) {
+ assert(stray_set.find(backfill[i]) == stray_set.end());
+ }
}
- dout(10) << "choose_acting want " << want << " (== acting)" << dendl;
+ dout(10) << "choose_acting want " << want << " (== acting) backfill_targets "
+ << backfill << dendl;
return true;
}
// count replicas that are not backfilling
unsigned active = 1;
- for (unsigned i=1; i<acting.size(); i++) {
- int peer = acting[i];
+ assert(actingbackfill.size() > 0);
+ for (unsigned i=1; i<actingbackfill.size(); i++) {
+ int peer = actingbackfill[i];
assert(peer_info.count(peer));
pg_info_t& pi = peer_info[peer];
}
}
+ assert(active == acting.size());
+
// degraded?
- if (get_osdmap()->get_pg_size(info.pgid) > active)
+ if (get_osdmap()->get_pg_size(info.pgid) > acting.size())
state_set(PG_STATE_DEGRADED);
// all clean?
dout(10) << "_activate_committed " << e << " peer_activated now " << peer_activated
<< " last_epoch_started " << info.history.last_epoch_started
<< " same_interval_since " << info.history.same_interval_since << dendl;
- if (peer_activated.size() == acting.size())
+ assert(actingbackfill.size() > 0);
+ if (peer_activated.size() == actingbackfill.size())
all_activated_and_committed();
} else {
dout(10) << "_activate_committed " << e << " telling primary" << dendl;
{
dout(10) << "all_activated_and_committed" << dendl;
assert(is_primary());
- assert(peer_activated.size() == acting.size());
+ assert(peer_activated.size() == actingbackfill.size());
+ assert(actingbackfill.size() > 0);
// info.last_epoch_started is set during activate()
info.history.last_epoch_started = info.last_epoch_started;
child->snap_trimq = snap_trimq;
+ // There can't be recovery/backfill going on now
get_osdmap()->pg_to_up_acting_osds(child->info.pgid, child->up, child->acting);
child->role = get_osdmap()->calc_pg_role(osd->whoami, child->acting);
if (get_primary() != child->get_primary())
finish_recovery_op(soid, true);
}
- backfill_target = -1;
+ backfill_targets.clear();
backfill_info.clear();
peer_backfill_info.clear();
waiting_on_backfill = false;
pg_stats_publish.stats.add(unstable_stats);
// calc copies, degraded
- unsigned target = MAX(get_osdmap()->get_pg_size(info.pgid), acting.size());
+ unsigned target = MAX(get_osdmap()->get_pg_size(info.pgid), actingbackfill.size());
pg_stats_publish.stats.calc_copies(target);
pg_stats_publish.stats.sum.num_objects_degraded = 0;
if ((is_degraded() || !is_clean()) && is_active()) {
uint64_t degraded = 0;
- // if the acting set is smaller than we want, add in those missing replicas
- if (acting.size() < target)
- degraded += (target - acting.size()) * num_objects;
+ // if the actingbackfill set is smaller than we want, add in those missing replicas
+ if (actingbackfill.size() < target)
+ degraded += (target - actingbackfill.size()) * num_objects;
// missing on primary
pg_stats_publish.stats.sum.num_objects_missing_on_primary =
pg_log.get_missing().num_missing();
degraded += pg_log.get_missing().num_missing();
- for (unsigned i=1; i<acting.size(); i++) {
- assert(peer_missing.count(acting[i]));
+ assert(actingbackfill.size() > 0);
+ for (unsigned i=1; i<actingbackfill.size(); i++) {
+ assert(peer_missing.count(actingbackfill[i]));
// in missing set
- degraded += peer_missing[acting[i]].num_missing();
+ degraded += peer_missing[actingbackfill[i]].num_missing();
// not yet backfilled
- degraded += num_objects - peer_info[acting[i]].stats.stats.sum.num_objects;
+ degraded += num_objects - peer_info[actingbackfill[i]].stats.stats.sum.num_objects;
}
pg_stats_publish.stats.sum.num_objects_degraded = degraded;
pg_stats_publish.stats.sum.num_objects_unfound = get_num_unfound();
void PG::trim_peers()
{
+ assert(is_primary());
calc_trim_to();
dout(10) << "trim_peers " << pg_trim_to << dendl;
if (pg_trim_to != eversion_t()) {
- for (unsigned i=1; i<acting.size(); i++)
- osd->send_message_osd_cluster(acting[i],
+ assert(actingbackfill.size() > 0);
+ for (unsigned i=1; i<actingbackfill.size(); i++)
+ osd->send_message_osd_cluster(actingbackfill[i],
new MOSDPGTrim(get_osdmap()->get_epoch(), info.pgid,
pg_trim_to),
get_osdmap()->get_epoch());
void PG::scrub_reserve_replicas()
{
+ assert(backfill_targets.empty());
for (unsigned i=1; i<acting.size(); i++) {
dout(10) << "scrub requesting reserve from osd." << acting[i] << dendl;
vector<OSDOp> scrub(1);
void PG::scrub_unreserve_replicas()
{
+ assert(backfill_targets.empty());
for (unsigned i=1; i<acting.size(); i++) {
dout(10) << "scrub requesting unreserve from osd." << acting[i] << dendl;
vector<OSDOp> scrub(1);
if (!scrubber.active) {
OSDMapRef curmap = osd->get_osdmap();
scrubber.is_chunky = true;
+ assert(backfill_targets.empty());
for (unsigned i=1; i<acting.size(); i++) {
ConnectionRef con = osd->get_con_osd_cluster(acting[i], get_osdmap()->get_epoch());
if (!con)
dout(10) << "share_pg_info" << dendl;
// share new pg_info_t with replicas
- for (unsigned i=1; i<acting.size(); i++) {
- int peer = acting[i];
+ assert(actingbackfill.size() > 0);
+ for (unsigned i=1; i<actingbackfill.size(); i++) {
+ int peer = actingbackfill[i];
- if (peer_info.count(i)) {
- peer_info[i].last_epoch_started = info.last_epoch_started;
- peer_info[i].history.merge(info.history);
+ // peer_info is keyed by osd id, so look up `peer` (the osd id), not the
+ // loop index i; indexing by i updates the wrong entry (or none at all).
+ if (peer_info.count(peer)) {
+ peer_info[peer].last_epoch_started = info.last_epoch_started;
+ peer_info[peer].history.merge(info.history);
dout(10) << __func__ << dendl;
assert(is_primary());
- vector<int>::const_iterator a = acting.begin();
- assert(a != acting.end());
- vector<int>::const_iterator end = acting.end();
+ vector<int>::const_iterator a = actingbackfill.begin();
+ assert(a != actingbackfill.end());
+ vector<int>::const_iterator end = actingbackfill.end();
while (++a != end) {
int peer(*a);
pg_missing_t& pmissing(peer_missing[peer]);
info.stats.mapping_epoch = info.history.same_interval_since;
}
+ // This will now be remapped during a backfill in cases
+ // that it would not have been before.
if (up != acting)
state_set(PG_STATE_REMAPPED);
else
peer_missing.clear();
peer_purged.clear();
+ actingbackfill.clear();
// reset primary state?
if (oldrole == 0 || get_role() == 0)
PG *pg = context< RecoveryMachine >().pg;
pg->state_set(PG_STATE_BACKFILL_WAIT);
ConnectionRef con = pg->osd->get_con_osd_cluster(
- pg->backfill_target, pg->get_osdmap()->get_epoch());
+ pg->get_backfill_target(), pg->get_osdmap()->get_epoch());
if (con) {
if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) {
unsigned priority = pg->is_degraded() ? OSDService::BACKFILL_HIGH
// if we finished backfill, all acting are active; recheck if
// DEGRADED is appropriate.
- if (pg->get_osdmap()->get_pg_size(pg->info.pgid) <= pg->acting.size())
+ assert(pg->actingbackfill.size() > 0);
+ if (pg->get_osdmap()->get_pg_size(pg->info.pgid) <= pg->actingbackfill.size())
pg->state_clear(PG_STATE_DEGRADED);
// adjust acting set? (e.g. because backfill completed...)
* this does not matter) */
if (advmap.lastmap->get_pg_size(pg->info.pgid) !=
pg->get_osdmap()->get_pg_size(pg->info.pgid)) {
- unsigned active = pg->acting.size();
- if (pg->backfill_target != -1)
- --active;
- if (pg->get_osdmap()->get_pg_size(pg->info.pgid) <= active)
+ if (pg->get_osdmap()->get_pg_size(pg->info.pgid) <= pg->acting.size())
pg->state_clear(PG_STATE_DEGRADED);
else
pg->state_set(PG_STATE_DEGRADED);
assert(pg->is_active());
assert(pg->is_primary());
+ assert(pg->actingbackfill.size() > 0);
// don't update history (yet) if we are active and primary; the replica
// may be telling us they have activated (and committed) but we can't
// share that until _everyone_ does the same.
- if (pg->is_acting(infoevt.from)) {
+ if (pg->is_actingbackfill(infoevt.from)) {
assert(pg->info.history.last_epoch_started <
pg->info.history.same_interval_since);
assert(infoevt.info.history.last_epoch_started >=
pg->peer_activated.insert(infoevt.from);
}
- if (pg->peer_activated.size() == pg->acting.size()) {
+ if (pg->peer_activated.size() == pg->actingbackfill.size()) {
pg->all_activated_and_committed();
}
return discard_event();
// how much log to request?
eversion_t request_log_from = pg->info.last_update;
- for (vector<int>::iterator p = pg->acting.begin() + 1; p != pg->acting.end(); ++p) {
+ assert(pg->actingbackfill.size() > 0);
+ for (vector<int>::iterator p = pg->actingbackfill.begin() + 1;
+ p != pg->actingbackfill.end(); ++p) {
pg_info_t& ri = pg->peer_info[*p];
if (ri.last_update >= best.log_tail && ri.last_update < request_log_from)
request_log_from = ri.last_update;
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
- for (vector<int>::iterator i = pg->acting.begin() + 1;
- i != pg->acting.end();
+ assert(pg->actingbackfill.size() > 0);
+ for (vector<int>::iterator i = pg->actingbackfill.begin() + 1;
+ i != pg->actingbackfill.end();
++i) {
const pg_info_t& pi = pg->peer_info[*i];
publish_stats_to_osd();
// done!
peer_missing[peer].got(soid, recovery_info.version);
- if (peer == backfill_target && backfills_in_flight.count(soid)) {
+ if (peer == get_backfill_target() && backfills_in_flight.count(soid)) {
map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
assert(i != recovering.end());
list<OpRequestRef> requeue_list;
{
if (pg_log.get_missing().missing.count(soid))
return true;
- for (unsigned i = 1; i < acting.size(); i++) {
- int peer = acting[i];
+ assert(actingbackfill.size() > 0);
+ for (unsigned i = 1; i < actingbackfill.size(); i++) {
+ int peer = actingbackfill[i];
if (peer_missing.count(peer) &&
peer_missing[peer].missing.count(soid))
return true;
// Object is degraded if after last_backfill AND
// we are backfilling it
- if (peer == backfill_target &&
+ if (peer == get_backfill_target() &&
peer_info[peer].last_backfill <= soid &&
last_backfill_started >= soid &&
backfills_in_flight.count(soid))
<< ", recovering"
<< dendl;
eversion_t v;
- for (unsigned i = 1; i < acting.size(); i++) {
- int peer = acting[i];
+ assert(actingbackfill.size() > 0);
+ for (unsigned i = 1; i < actingbackfill.size(); i++) {
+ int peer = actingbackfill[i];
if (peer_missing.count(peer) &&
peer_missing[peer].missing.count(soid)) {
v = peer_missing[peer].missing[soid].need;
for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
f->dump_unsigned("osd", *p);
f->close_section();
+ if (backfill_targets.size() > 0) {
+ f->open_array_section("backfill_targets");
+ for (vector<int>::iterator p = backfill_targets.begin(); p != backfill_targets.end(); ++p)
+ f->dump_unsigned("osd", *p);
+ f->close_section();
+ }
+ if (actingbackfill.size() > 0) {
+ f->open_array_section("actingbackfill");
+ for (vector<int>::iterator p = actingbackfill.begin(); p != actingbackfill.end(); ++p)
+ f->dump_unsigned("osd", *p);
+ f->close_section();
+ }
f->open_object_section("info");
info.dump(f.get());
f->close_section();
// opposite is not a problem; if the target is after the line, we
// don't apply on the backfill_target and it doesn't matter.)
pg_info_t *backfill_target_info = NULL;
+ int backfill_target = get_backfill_target();
bool before_backfill = false;
if (backfill_target >= 0) {
backfill_target_info = &peer_info[backfill_target];
case MOSDPGScan::OP_SCAN_DIGEST:
{
int from = m->get_source().num();
- assert(from == backfill_target);
+ assert(from == get_backfill_target());
BackfillInterval& bi = peer_backfill_info;
bi.begin = m->begin;
bi.end = m->end;
switch (m->op) {
case MOSDPGBackfill::OP_BACKFILL_FINISH:
{
- assert(is_replica());
assert(cct->_conf->osd_kill_backfill_at != 1);
MOSDPGBackfill *reply = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
{
- assert(is_replica());
assert(cct->_conf->osd_kill_backfill_at != 2);
info.last_backfill = m->last_backfill;
ctx->obc->ssc->snapset = ctx->new_snapset;
info.stats.stats.add(ctx->delta_stats, ctx->obc->obs.oi.category);
+ int backfill_target = get_backfill_target();
if (backfill_target >= 0) {
pg_info_t& pinfo = peer_info[backfill_target];
if (soid <= pinfo.last_backfill)
int acks_wanted = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
- if (ctx->op && acting.size() > 1) {
+ assert(actingbackfill.size() > 0);
+ if (ctx->op && actingbackfill.size() > 1) {
ostringstream ss;
- ss << "waiting for subops from " << vector<int>(acting.begin() + 1, acting.end());
+ ss << "waiting for subops from " << vector<int>(actingbackfill.begin() + 1, actingbackfill.end());
ctx->op->mark_sub_op_sent(ss.str());
}
- for (unsigned i=1; i<acting.size(); i++) {
- int peer = acting[i];
+ for (unsigned i=1; i<actingbackfill.size(); i++) {
+ int peer = actingbackfill[i];
pg_info_t &pinfo = peer_info[peer];
repop->waitfor_ack.insert(peer);
assert(0 == "broken implementation, do not use");
}
+ int backfill_target = get_backfill_target();
// ship resulting transaction, log entries, and pg_stats
if (peer == backfill_target && soid > last_backfill_started &&
// only skip normal (not temp pool=-1) objects
// sanity checks
assert(m->map_epoch >= info.history.same_interval_since);
assert(is_active());
- assert(is_replica());
// we better not be missing this.
assert(!pg_log.get_missing().is_missing(soid));
last_complete_ondisk = last_complete;
if (last_complete_ondisk == info.last_update) {
- if (is_replica()) {
+ if (!is_primary()) {
+ // Either we are a replica or backfill target.
// we are fully up to date. tell the primary!
osd->send_message_osd_cluster(get_primary(),
new MOSDPGTrim(get_osdmap()->get_epoch(), info.pgid,
last_complete_ondisk),
get_osdmap()->get_epoch());
- } else if (is_primary()) {
+ } else {
// we are the primary. tell replicas to trim?
if (calc_min_last_complete_ondisk())
trim_peers();
v = pg_log.get_missing().missing.find(oid)->second.have;
dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
- for (unsigned i=1; i<acting.size(); ++i) {
- int peer = acting[i];
+ assert(actingbackfill.size() > 0);
+ for (unsigned i=1; i<actingbackfill.size(); ++i) {
+ int peer = actingbackfill[i];
if (!peer_missing[peer].is_missing(oid)) {
- assert(peer == backfill_target);
+ assert(peer == get_backfill_target());
continue;
}
eversion_t h = peer_missing[peer].missing[oid].have;
cancel_recovery();
}
+// For now we only care about a single backfill target at a time: record the position its backfill starts from.
void ReplicatedPG::on_activate()
{
- for (unsigned i = 1; i<acting.size(); i++) {
- if (peer_info[acting[i]].last_backfill != hobject_t::get_max()) {
- assert(backfill_target == -1);
- backfill_target = acting[i];
- last_backfill_started = peer_info[acting[i]].last_backfill;
- dout(10) << " chose backfill target osd." << backfill_target
- << " from " << last_backfill_started << dendl;
- }
- }
+ int backfill_target = get_backfill_target();
+ if (backfill_target == -1) // NOTE(review): presumably -1 means no backfill target is set -- confirm in get_backfill_target()
+ return;
+ last_backfill_started = peer_info[backfill_target].last_backfill; // starting point is the last_backfill recorded in the target's peer_info
+ assert(last_backfill_started != hobject_t::get_max()); // the removed loop only picked peers whose last_backfill != get_max(); now that is asserted instead
+ dout(10) << " chose backfill target osd." << backfill_target
+ << " from " << last_backfill_started << dendl;
}
void ReplicatedPG::on_change(ObjectStore::Transaction *t)
work_in_progress = true;
bool deferred_backfill = false;
+ int backfill_target = get_backfill_target();
if (recovering.empty() &&
state_test(PG_STATE_BACKFILL) &&
backfill_target >= 0 && started < max &&
const hobject_t& soid, eversion_t v,
PGBackend::RecoveryHandle *h)
{
+ assert(is_primary());
dout(10) << __func__ << ": on " << soid << dendl;
// NOTE: we know we will get a valid oloc off of disk here.
if (!obc) {
pg_log.missing_add(soid, v, eversion_t());
bool uhoh = true;
- for (unsigned i=1; i<acting.size(); i++) {
- int peer = acting[i];
+ assert(actingbackfill.size() > 0);
+ for (unsigned i=1; i<actingbackfill.size(); i++) {
+ int peer = actingbackfill[i];
if (!peer_missing[peer].is_missing(soid, v)) {
missing_loc[soid].insert(peer);
missing_loc_sources.insert(peer);
{
int pushes = 0;
// who needs it?
- for (unsigned i=1; i<get_parent()->get_acting().size(); i++) {
- int peer = get_parent()->get_acting()[i];
+ assert(get_parent()->get_actingbackfill().size() > 0);
+ for (unsigned i=1; i<get_parent()->get_actingbackfill().size(); i++) {
+ int peer = get_parent()->get_actingbackfill()[i];
map<int, pg_missing_t>::const_iterator j =
get_parent()->get_peer_missing().find(peer);
assert(j != get_parent()->get_peer_missing().end());
PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
// this is FAR from an optimal recovery order. pretty lame, really.
- for (unsigned i=1; i<acting.size(); i++) {
- int peer = acting[i];
+ assert(actingbackfill.size() > 0);
+ for (unsigned i=1; i<actingbackfill.size(); i++) {
+ int peer = actingbackfill[i];
map<int, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
assert(pm != peer_missing.end());
map<int, pg_info_t>::const_iterator pi = peer_info.find(peer);
ThreadPool::TPHandle &handle, bool *work_started)
{
dout(10) << "recover_backfill (" << max << ")" << dendl;
+ int backfill_target = get_backfill_target();
assert(backfill_target >= 0);
pg_info_t& pinfo = peer_info[backfill_target];
dout(10) << "push_backfill_object " << oid << " v " << v << " to osd." << peer << dendl;
backfills_in_flight.insert(oid);
- map<int, pg_missing_t>::iterator bpm = peer_missing.find(backfill_target);
+ map<int, pg_missing_t>::iterator bpm = peer_missing.find(get_backfill_target());
assert(bpm != peer_missing.end());
bpm->second.add(oid, eversion_t(), eversion_t());