might_have_unfound.clear();
backfill_target = -1;
- peer_backfill_info = BackfillInterval();
+ backfill_info.clear();
+ peer_backfill_info.clear();
last_update_ondisk = eversion_t();
all_info[osd->whoami] = info;
for (map<int,Info>::iterator p = all_info.begin(); p != all_info.end(); ++p) {
- dout(10) << "choose_acting osd." << p->first << " " << p->second << dendl;
+ dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl;
}
// find osd with newest last_update. if there are multiples, prefer
continue;
}
// prefer longer tail, if it will bring another peer in log contiguity
- if (p->second.log_tail < newest_update_osd->second.log_tail) {
- bool worse = false;
- for (map<int,Info>::iterator q = all_info.begin(); q != all_info.end(); ++q) {
- if (q->second.is_incomplete())
- continue; // don't care about log contiguity
- if (q->second.last_update < newest_update_osd->second.log_tail &&
- q->second.last_update >= p->second.log_tail) {
- dout(10) << "choose_acting prefer osd." << p->first
- << " because it brings osd." << q->first << " into log contiguity" << dendl;
- newest_update_osd = p;
- continue;
- }
- if (q->second.last_update < p->second.log_tail &&
- q->second.last_update >= newest_update_osd->second.log_tail) {
- worse = true;
- break;
- }
- }
- if (worse)
+ bool worse = false;
+ for (map<int,Info>::iterator q = all_info.begin(); q != all_info.end(); ++q) {
+ if (q->second.is_incomplete())
+ continue; // don't care about log contiguity
+ if (q->second.last_update < newest_update_osd->second.log_tail &&
+ q->second.last_update >= p->second.log_tail) {
+ dout(10) << "calc_acting prefer osd." << p->first
+ << " because it brings osd." << q->first << " into log contiguity" << dendl;
+ newest_update_osd = p;
continue;
+ }
+ if (q->second.last_update < p->second.log_tail &&
+ q->second.last_update >= newest_update_osd->second.log_tail) {
+ worse = true;
+ break;
+ }
}
+ if (worse)
+ continue;
+
// prefer current primary (usually the caller), all things being equal
if (p->first == acting[0]) {
- dout(10) << "choose_acting prefer osd." << p->first
+ dout(10) << "calc_acting prefer osd." << p->first
<< " because it is current primary" << dendl;
newest_update_osd = p;
continue;
}
}
- dout(10) << "choose_acting newest update on osd." << newest_update_osd->first
+ dout(10) << "calc_acting newest update on osd." << newest_update_osd->first
<< " with " << newest_update_osd->second << dendl;
newest_update_osd_id = newest_update_osd->first;
if (p->second.is_incomplete())
continue;
if (primary->second.is_incomplete()) {
- dout(10) << "choose_acting prefer osd." << p->first << " because not incomplete" << dendl;
+ dout(10) << "calc_acting prefer osd." << p->first << " because not incomplete" << dendl;
primary = p;
continue;
}
if (p->second.last_update < newest_update_osd->second.log_tail)
continue;
if (primary->second.last_update < newest_update_osd->second.log_tail) {
- dout(10) << "choose_acting prefer osd." << p->first
+ dout(10) << "calc_acting prefer osd." << p->first
<< " because log contiguous with newest osd." << newest_update_osd->first << dendl;
primary = p;
continue;
if (!q->second.is_incomplete() &&
q->second.last_update < primary->second.log_tail &&
q->second.last_update >= p->second.log_tail) {
- dout(10) << "choose_acting prefer osd." << p->first
+ dout(10) << "calc_acting prefer osd." << p->first
<< " because it brings osd." << q->first << " into log contiguity" << dendl;
primary = p;
continue;
if (primary->second.is_incomplete() ||
primary->second.last_update < newest_update_osd->second.log_tail) {
- dout(10) << "choose_acting no acceptable primary, reverting to up " << up << dendl;
+ dout(10) << "calc_acting no acceptable primary, reverting to up " << up << dendl;
want = up;
return;
}
- dout(10) << "choose_acting primary is osd." << primary->first
+ dout(10) << "calc_acting primary is osd." << primary->first
<< " with " << primary->second << dendl;
want.push_back(primary->first);
+ unsigned usable = 1;
// select replicas that have log contiguity with primary
for (vector<int>::const_iterator i = up.begin();
want.push_back(*i);
} else {
want.push_back(*i);
+ usable++;
dout(10) << " osd." << *i << " (up) accepted " << cur_info << dendl;
}
}
for (map<int,Info>::const_iterator i = all_info.begin();
i != all_info.end();
++i) {
- if (want.size() >= get_osdmap()->get_pg_size(info.pgid))
+ if (usable >= get_osdmap()->get_pg_size(info.pgid))
break;
// skip up osds we already considered above
}
last_update_applied = info.last_update;
- assert(info.last_complete >= log.tail && !info.is_incomplete());
+ assert(info.last_complete >= log.tail);
need_up_thru = false;
assert(peer_info.count(peer));
PG::Info& pi = peer_info[peer];
- MOSDPGLog *m = 0;
-
dout(10) << "activate peer osd." << peer << " " << pi << dendl;
if (log.tail > pi.last_update) {
// reset, backfill
pi.last_update = info.last_update;
pi.last_complete = info.last_complete;
+ pi.log_tail = info.last_update;
pi.last_backfill = hobject_t();
pi.history = info.history;
(*activator_map)[peer] = new MOSDPGInfo(get_osdmap()->get_epoch());
(*activator_map)[peer]->pg_info.push_back(pi);
} else {
- m = new MOSDPGLog(get_osdmap()->get_epoch(), pi);
- osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(peer));
+ MOSDPGInfo *mp = new MOSDPGInfo(get_osdmap()->get_epoch());
+ mp->pg_info.push_back(pi);
+ osd->cluster_messenger->send_message(mp, get_osdmap()->get_cluster_inst(peer));
}
continue;
}
+ MOSDPGLog *m = 0;
if (pi.last_update == info.last_update) {
// empty log
if (!pi.is_empty() && activator_map) {
boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
{
PG *pg = context< RecoveryMachine >().pg;
- dout(10) << "got info from osd." << infoevt.from << dendl;
+ dout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
- if (pg->is_replica()) {
- assert(pg->log.tail <= pg->info.last_complete);
- assert(pg->log.head == pg->info.last_update);
- post_event(Activate());
- } else {
- // pg creation for backfill
- dout(10) << "updating info to " << infoevt.info << dendl;
+ if (infoevt.info.last_update != pg->info.last_update) {
+ dout(10) << " reset for backfill" << dendl;
pg->info = infoevt.info;
-
- ObjectStore::Transaction *t = new ObjectStore::Transaction;
- pg->write_info(*t);
- int tr = pg->osd->store->queue_transaction(&pg->osr, t);
- assert(tr == 0);
+ assert(pg->info.log_tail == pg->info.last_update);
+ assert(pg->info.last_backfill == hobject_t());
+ pg->log.clear();
+ pg->log.head = pg->info.last_update;
+ pg->log.tail = pg->info.last_update;
}
+
+ assert(pg->log.tail <= pg->info.last_complete);
+ assert(pg->log.head == pg->info.last_update);
+
+ post_event(Activate());
return discard_event();
}
case CEPH_OSD_OP_PUSH:
sub_op_push(op);
return;
+ case CEPH_OSD_OP_DELETE:
+ sub_op_remove(op);
+ return;
case CEPH_OSD_OP_SCRUB_RESERVE:
sub_op_scrub_reserve(op);
return;
for (unsigned i=1; i<acting.size(); i++) {
int peer = acting[i];
-
+ Info &pinfo = peer_info[peer];
+
repop->waitfor_ack.insert(peer);
repop->waitfor_disk.insert(peer);
wr->set_data(repop->ctx->op->get_data()); // _copy_ bufferlist
} else {
// ship resulting transaction, log entries, and pg_stats
- ::encode(repop->ctx->op_t, wr->get_data());
+ if (soid > pinfo.last_backfill) {
+ dout(10) << "issue_repop shipping empty opt to osd." << peer << ", object beyond last_backfill "
+ << pinfo.last_backfill << dendl;
+ ObjectStore::Transaction t;
+ ::encode(t, wr->get_data());
+ } else {
+ ::encode(repop->ctx->op_t, wr->get_data());
+ }
::encode(repop->ctx->log, wr->logbl);
wr->pg_stats = info.stats;
}
osd->cluster_messenger->send_message(wr, get_osdmap()->get_cluster_inst(peer));
// keep peer_info up to date
- Info &in = peer_info[peer];
- in.last_update = ctx->at_version;
- if (in.last_complete == old_last_update)
- in.last_update = ctx->at_version;
+ if (pinfo.last_complete == pinfo.last_update)
+ pinfo.last_update = ctx->at_version;
+ pinfo.last_update = ctx->at_version;
}
}
pi->data_subset_pushing, pi->clone_subsets);
} else {
// done!
- if (peer_missing[peer].is_missing(soid)) // so that we ignore backfill; imprecise!
- peer_missing[peer].got(soid, pi->version);
+ peer_missing[peer].got(soid, pi->version);
pushing[soid].erase(peer);
pi = NULL;
op->put();
}
+void ReplicatedPG::sub_op_remove(MOSDSubOp *op)
+{
+ dout(7) << "sub_op_remove " << op->poid << dendl;
+
+ ObjectStore::Transaction *t = new ObjectStore::Transaction;
+ remove_object_with_snap_hardlinks(*t, op->poid);
+ int r = osd->store->queue_transaction(&osr, t);
+ assert(r == 0);
+
+ op->put();
+}
eversion_t ReplicatedPG::pick_newest_available(const hobject_t& oid)
Info& pinfo = peer_info[backfill_target];
BackfillInterval& pbi = peer_backfill_info;
- dout(10) << " peer osd." << backfill_target << " " << pinfo
+ hobject_t pos = pbi.begin;
+
+ dout(10) << " peer osd." << backfill_target
+ << " pos " << pos
+ << " info " << pinfo
<< " interval " << pbi.begin << "-" << pbi.end
<< " " << pbi.objects.size() << " objects" << dendl;
- // does the pg exist yet on the peer?
- if (pinfo.dne()) {
- // ok, we know they have no objects.
- pbi.end = hobject_t::get_max();
-
- // fill in pinfo
- pinfo.last_update = info.last_update;
- pinfo.log_tail = info.last_update;
- pinfo.last_backfill = hobject_t();
- pinfo.history = info.history;
- dout(10) << " peer osd." << backfill_target << " pg dne; setting info to " << pinfo << dendl;
-
- // create pg on remote
- MOSDPGInfo *mp = new MOSDPGInfo(get_osdmap()->get_epoch());
- mp->pg_info.push_back(pinfo);
- osd->cluster_messenger->send_message(mp, get_osdmap()->get_cluster_inst(backfill_target));
- }
+ // re-scan our local interval to cope with recent changes
+ dout(10) << " rescanning local backfill_info from " << pos << dendl;
+ backfill_info.clear();
+ osr.flush();
+ scan_range(pos, 10, 20, &backfill_info);
int ops = 0;
while (ops < max) {
- if (!backfill_info.at_end() && (backfill_info.end <= pbi.begin ||
+ if (!backfill_info.extends_to_end() && (backfill_info.end <= pbi.begin ||
backfill_info.empty())) {
osr.flush();
scan_range(backfill_info.end, 10, 20, &backfill_info);
<< " " << backfill_info.objects << dendl;
dout(20) << " peer backfill " << pbi.begin << "-" << pbi.end << " " << pbi.objects << dendl;
- if (!pbi.at_end() && (pbi.end <= backfill_info.begin ||
+ if (!pbi.extends_to_end() && (pbi.end <= backfill_info.begin ||
pbi.empty())) {
epoch_t e = get_osdmap()->get_epoch();
MOSDPGScan *m = new MOSDPGScan(MOSDPGScan::OP_SCAN_GET_DIGEST, e, e, info.pgid,
if (backfill_info.empty()) {
// this only happens when we reach the end of the collection.
- assert(backfill_info.at_end());
+ assert(backfill_info.extends_to_end());
if (pbi.empty()) {
- assert(pbi.at_end());
+ assert(pbi.extends_to_end());
dout(10) << " reached end for both local and peer" << dendl;
if (pbi.begin != hobject_t::get_max()) {
pbi.begin = hobject_t::get_max();
eversion_t mv = backfill_info.objects.begin()->second;
if (pbi.empty()) {
- assert(pbi.at_end());
+ assert(pbi.extends_to_end());
dout(20) << " pushing local " << my_first << " " << backfill_info.objects.begin()->second
<< " to peer osd." << backfill_target << dendl;
- push_backfill_object(my_first, mv, backfill_target);
+ push_backfill_object(my_first, mv, eversion_t(), backfill_target);
backfill_info.pop_front();
pbi.begin = my_first;
++ops;
dout(20) << " removing peer " << peer_first << " <= local " << my_first << dendl;
send_remove_op(peer_first, pv, backfill_target);
pbi.pop_front();
+ if (pbi.begin < backfill_info.begin)
+ pbi.begin = backfill_info.begin;
} else if (peer_first == my_first) {
if (pv == mv) {
dout(20) << " keeping peer " << peer_first << " " << pv << dendl;
} else {
dout(20) << " replacing peer " << peer_first << " with local " << mv << dendl;
- push_backfill_object(my_first, mv, backfill_target);
+ push_backfill_object(my_first, mv, pv, backfill_target);
++ops;
}
pbi.pop_front();
backfill_info.pop_front();
+ if (pbi.begin < backfill_info.begin)
+ pbi.begin = backfill_info.begin;
} else {
// peer_first > my_first
dout(20) << " pushing local " << my_first << " " << mv
<< " to peer osd." << backfill_target << dendl;
- push_backfill_object(my_first, mv, backfill_target);
+ push_backfill_object(my_first, mv, eversion_t(), backfill_target);
backfill_info.pop_front();
++ops;
}
hobject_t bound = pbi.begin;
bound.back_up_to_bounding_key();
if (pinfo.last_backfill < bound) {
- pinfo.last_backfill = bound;
-
dout(10) << " peer osd." << backfill_target << " info.last_backfill now " << pinfo.last_backfill << dendl;
epoch_t e = get_osdmap()->get_epoch();
MOSDPGBackfill *m = new MOSDPGBackfill(MOSDPGBackfill::OP_BACKFILL_PROGRESS, e, e, info.pgid);
- m->last_backfill = pinfo.last_backfill;
+ m->last_backfill = bound;
osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(backfill_target));
}
return ops;
}
-void ReplicatedPG::push_backfill_object(hobject_t oid, eversion_t v, int peer)
+void ReplicatedPG::push_backfill_object(hobject_t oid, eversion_t v, eversion_t have, int peer)
{
dout(10) << "push_backfill_object " << oid << " v " << v << " to osd." << peer << dendl;
+
+ // object is now below the waterline; mark it missing.
+ peer_info[peer].last_backfill = oid;
+ peer_missing[peer].add(oid, v, have);
+
start_recovery_op(oid);
ObjectContext *obc = get_object_context(oid, OLOC_BLANK, false);
obc->ondisk_read_lock();
assert(is_locked());
dout(10) << "scan_range from " << begin << dendl;
bi->begin = begin;
+ bi->objects.clear(); // for good measure
vector<hobject_t> ls;
ls.reserve(max);